diff -Nru rocfft-5.5.0/.github/dependabot.yml rocfft-5.7.1/.github/dependabot.yml
--- rocfft-5.5.0/.github/dependabot.yml	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/.github/dependabot.yml	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,12 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/docs/.sphinx" # Location of package manifests
+    open-pull-requests-limit: 10
+    schedule:
+      interval: "daily"
diff -Nru rocfft-5.5.0/.gitignore rocfft-5.7.1/.gitignore
--- rocfft-5.5.0/.gitignore	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.gitignore	2023-08-09 16:19:51.000000000 +0000
@@ -40,3 +40,12 @@
 
 # python bytecode
 __pycache__
+
+# documentation artifacts
+_build/
+_images/
+_static/
+_templates/
+_toc.yml
+docBin/
+_doxygen/
diff -Nru rocfft-5.5.0/.jenkins/application.groovy rocfft-5.7.1/.jenkins/application.groovy
--- rocfft-5.5.0/.jenkins/application.groovy	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/.jenkins/application.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,182 @@
+#!/usr/bin/env groovy
+// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
+@Library('rocJenkins@pong') _
+
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path
+
+def runCI = 
+{
+    nodeDetails, jobName->
+
+    def prj  = new rocProject('rocFFT-internal', 'application')
+
+    prj.defaults.ccache = true
+    prj.timeout.compile = 600
+    prj.timeout.test = 600
+    prj.libraryDependencies = ['rocFFT', 'hipFFT']
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(nodeDetails, jobName, prj)
+
+    boolean formatCheck = false
+
+    def commonGroovy
+
+    // Clones the ROCm Gromacs fork and builds it twice: thread-MPI (build_tmpi) and MPI (build_mpi).
+    // Both builds install into the same ../gromacs-install prefix and must target gfx90a,
+    // the GPU architecture of the '8gfx90a' CI nodes on which the tests are run.
+    def compileCommand =
+    {
+        platform, project->
+        def getDependenciesCommand = ""
+        if (project.installLibraryDependenciesFromCI) {
+            project.libraryDependencies.each { libraryName ->
+                getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
+            }
+        }
+        def command = """#!/usr/bin/env bash
+                         set -ex
+                         cd ${project.paths.project_build_prefix}
+                         ${getDependenciesCommand}
+                         git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git
+                         cd Gromacs
+                         
+                         mkdir build_tmpi
+                         cd build_tmpi
+                         cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install ..
+                         make
+                         make install
+                         cd ..
+
+                         mkdir build_mpi
+                         cd build_mpi
+                         cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install ..
+                         make
+                         make install
+                         cd ..
+                      """
+        platform.runCommand(this, command)
+    }
+
+    def testCommand = // Runs the benchmark-gromacs cases with the Gromacs install under the project build prefix
+    {
+        platform, project->
+        // Sources GMXRC, clones benchmark-gromacs, then runs adh_dodec, stmv and cellulose_nve on 1/2/4/8 GPUs: first with gmx (thread-MPI), then with mpirun gmx_mpi.
+        def command = """#!/usr/bin/env bash
+                         set -ex
+                         cd ${project.paths.project_build_prefix}
+                         cd Gromacs
+
+                         source gromacs-install/bin/GMXRC
+                         gmx --version
+
+                         export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib
+                         echo \$LD_LIBRARY_PATH
+
+                         git clone https://github.com/jychang48/benchmark-gromacs.git
+                         cd benchmark-gromacs
+
+                         export GMX_MAXBACKUP=-1
+
+                         echo "* Threaded MPI ******************************************************************************************************"
+
+                         #ADH_DODEC
+                         cd adh_dodec
+                         tar zxf adh_dodec.tar.gz
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100               # 1 GPU
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200      # 2 GPUs   
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200    # 4 GPUs
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs
+                         
+                         # STMV
+                         cd ..
+                         cd stmv/
+                         tar zxf stmv.tar.gz
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200               # 1 GPU
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200      # 2 GPUs
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400     # 4 GPUs
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs
+ 
+                         # CELLULOSE_NVE
+                         cd ..
+                         cd cellulose_nve/
+                         tar zxf cellulose_nve.tar.gz
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100               # 1 GPU
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200      # 2 GPUs
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200     # 4 GPUs
+                         gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs
+
+                         echo "* MPI ***************************************************************************************************************" 
+ 
+                         # ADH_DODEC
+                         cd ..
+                         cd adh_dodec/
+                         tar zxf adh_dodec.tar.gz
+                         mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr                # 1 GPU
+                         mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr        # 2 GPUs
+                         mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr      # 4 GPUs
+                         mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr  # 8 GPUs
+ 
+                         # STMV
+                         cd ..
+                         cd stmv/
+                         tar zxf stmv.tar.gz
+                         mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr   # 1 GPU
+                         mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr        # 2 GPUs
+                         mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr      # 4 GPUs
+                         mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr  # 8 GPUs
+ 
+                         # CELLULOSE_NVE
+                         cd ..
+                         cd cellulose_nve/
+                         tar zxf cellulose_nve.tar.gz
+                         mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr                # 1 GPU
+                         mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr        # 2 GPUs
+                         mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr      # 4 GPUs
+                         mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr  # 8 GPUs
+                      """
+        platform.runCommand(this, command)
+    }
+
+    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)
+}
+
+ci: {
+    // Top-level job name, as reported by auxiliary.getTopJobName for this build's URL.
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    // Map of job name -> trigger properties, passed through auxiliary.appendPropertyList.
+    def propertyList = auxiliary.appendPropertyList(["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]])
+
+    // Map of job name -> node details, passed through auxiliary.appendJobNameList.
+    def jobNameList = auxiliary.appendJobNameList(["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])])
+
+    // Apply the properties registered under the running job's name, if any.
+    propertyList.each { name, jobProperties ->
+        if (name == urlJobName) {
+            properties(auxiliary.addCommonProperties(jobProperties))
+        }
+    }
+
+    // Run the CI stage with the node details registered under the running job's name, if any.
+    jobNameList.each { name, nodeDetails ->
+        if (name == urlJobName) {
+            stage(name) {
+                runCI(nodeDetails, name)
+            }
+        }
+    }
+
+    // Fallback for url job names that are not listed in jobNameList, e.g. compute-rocm-dkms-no-npi-1901
+    if (!jobNameList.keySet().contains(urlJobName)) {
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
+        stage(urlJobName) {
+            runCI([ubuntu18:['8gfx90a']], urlJobName)
+        }
+    }
+}
diff -Nru rocfft-5.5.0/.jenkins/common.groovy rocfft-5.7.1/.jenkins/common.groovy
--- rocfft-5.5.0/.jenkins/common.groovy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.jenkins/common.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -12,14 +12,14 @@
         { libraryName ->
             getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
         }
-    }            
+    }
 
-    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON -DBUILD_FFTW=ON'
+    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON'
     String warningArgs = '-DWERROR=ON'
+    String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON'
     String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
     String buildTypeDir = debug ? 'debug' : 'release'
     String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : ''
-    String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : ''
     String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
     //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
     String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
@@ -32,7 +32,7 @@
                 set -e
                 mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
                 ${auxiliary.gfxTargetParser()}
-                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${warningArgs} ${hipClangArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../..
+                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../..
                 make -j\$(nproc)
                 sudo make install
             """
@@ -46,12 +46,11 @@
 
     project.paths.construct_build_prefix()
 
-    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON -DBUILD_GTEST=ON -DBUILD_FFTW=ON'
+    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON'
     String warningArgs = '-DWERROR=ON'
     String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
     String buildTypeDir = debug ? 'debug' : 'release'
     //String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : ''
-    String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : ''
     String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
     String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
 
@@ -62,7 +61,7 @@
                 set -ex
                 cd ${project.paths.project_build_prefix}/clients
                 mkdir -p build && cd build
-                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArgClients} ${hipClangArgs} ${cmakePrefixPathArg} ../
+                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArgClients} ${cmakePrefixPathArg} ../
                 make -j\$(nproc)
             """
     platform.runCommand(this, command)
@@ -88,6 +87,15 @@
     def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false)
     platform.runCommand(this, packageHelper[0])
     platform.archiveArtifacts(this, packageHelper[1])
+
+    // Trim temporary files: remove the CPack staging directory and object files from the build directory
+    def command = """#!/usr/bin/env bash
+                     set -ex
+                     cd ${project.paths.project_build_prefix}/build/${directory}/
+                     rm -rf _CPack_Packages/
+                     find -name '*.o' -delete
+                  """
+    platform.runCommand(this, command)
 }
 
 def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble)
@@ -106,7 +114,6 @@
     String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : ''
     String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}"
 
-    String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : ''
     String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
     //Set CI node's gfx arch as target if PR, otherwise use default targets of the library
     String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
@@ -119,7 +126,7 @@
                 rm -rf build/${buildTypeDir}
                 mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
                 ${auxiliary.gfxTargetParser()}
-                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${hipClangArgs} ${amdgpuTargets} ${rtcBuildCache} ../..
+                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../..
                 make -j\$(nproc)
             """
     platform.runCommand(this, command)
diff -Nru rocfft-5.5.0/.jenkins/debug.groovy rocfft-5.7.1/.jenkins/debug.groovy
--- rocfft-5.5.0/.jenkins/debug.groovy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.jenkins/debug.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -18,7 +18,7 @@
     prj.defaults.ccache = true
     prj.timeout.compile = 600
     prj.timeout.test = 600
-    prj.libraryDependencies = ['rocRAND']
+    prj.libraryDependencies = ['rocRAND','hipRAND']
 
     // Define test architectures, optional rocm version argument is available
     def nodes = new dockerNodes(nodeDetails, jobName, prj)
diff -Nru rocfft-5.5.0/.jenkins/performance.groovy rocfft-5.7.1/.jenkins/performance.groovy
--- rocfft-5.5.0/.jenkins/performance.groovy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.jenkins/performance.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -28,12 +28,11 @@
        git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git'
     }
 
-    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON -DBUILD_FFTW=OFF'
-    String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_RIDER=OFF -DBUILD_FFTW=OFF'
+    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON'
+    String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_RIDER=OFF'
     String warningArgs = '-DWERROR=ON'
     String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
     String buildTypeDir = debug ? 'debug' : 'release'
-    String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : ''
     String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_DIR/rocfft_build_cache.db"
     String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'
 
@@ -44,13 +43,13 @@
                 set -e
                 mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
                 ${auxiliary.gfxTargetParser()}
-                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${clientArgs} ${warningArgs} ${hipClangArgs} ${rtcBuildCache} ../..
+                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${clientArgs} ${warningArgs} ${rtcBuildCache} ../..
                 make -j\$(nproc)
                 popd
                 cd ref-repo
                 mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
                 ${auxiliary.gfxTargetParser()}
-                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${noclientArgs} ${warningArgs} ${hipClangArgs} ${rtcBuildCache} ../..
+                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${noclientArgs} ${warningArgs} ${rtcBuildCache} ../..
                 make -j\$(nproc)
             """
     platform.runCommand(this, command)
@@ -89,41 +88,39 @@
                     reportTitles: "${dataType}-precision-${platform.gpu}"])
     }
 
-    if (platform.gpu != 'gfx90a')
+    
+    withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')])
     {
-        withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')])
-        {
-            platform.runCommand(
-                this,
-                """
-                cd ${project.paths.build_prefix}
-                git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.0
-                cd rocPTS
-                python3 -m pip install build
-                python3 -m build
-                python3 -m pip install .
-                """
-            )
-        }
-        writeFile(
-            file: project.paths.project_build_prefix + "/record_pts.py",
-            text: libraryResource("com/amd/scripts/record_pts.py"))
-        def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : ""
-        def command = """#!/usr/bin/env bash
-        set -ex
-        cd ${project.paths.project_build_prefix}
-        ${setupBranch}
-        git checkout \$BRANCH_NAME
-        benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d)
-        mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref
-        cp -uf ./*_change/* \${benchmark_folder}/all_change
-        cp -uf ./*_ref/* \${benchmark_folder}/all_ref
-        python3 ./record_pts.py --dataset-path \$PWD/\${benchmark_folder} --reference-dataset all_ref --new-dataset all_change -v 5.3 -l pts_rocfft_benchmark_data
-        """
-        withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')])
-        {
-            platform.runCommand(this, command)
-        }
+        platform.runCommand(
+            this,
+            """
+            cd ${project.paths.build_prefix}
+            git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.1.0
+            cd rocPTS
+            python3 -m pip install build
+            python3 -m build
+            python3 -m pip install .
+            """
+        )
+    }
+    writeFile(
+        file: project.paths.project_build_prefix + "/record_pts.py",
+        text: libraryResource("com/amd/scripts/record_pts.py"))
+    def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : ""
+    def command = """#!/usr/bin/env bash
+    set -ex
+    cd ${project.paths.project_build_prefix}
+    ${setupBranch}
+    git checkout \$BRANCH_NAME
+    benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d)
+    mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref
+    cp -uf ./*_change/* \${benchmark_folder}/all_change
+    cp -uf ./*_ref/* \${benchmark_folder}/all_ref
+    python3 ./record_pts.py --dataset-path \$PWD/\${benchmark_folder} --reference-dataset all_ref --new-dataset all_change -v 5.5 -l pts_rocfft_benchmark_data-v1.0.0
+    """
+    withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')])
+    {
+        platform.runCommand(this, command)
     }
 }
 
@@ -136,7 +133,7 @@
     prj.defaults.ccache = true
     prj.timeout.compile = 600
     prj.timeout.test = 600
-    prj.libraryDependencies = ['rocRAND']
+    prj.libraryDependencies = ['rocRAND','hipRAND']
 
     // Define test architectures, optional rocm version argument is available
     def nodes = new dockerNodes(nodeDetails, jobName, prj)
diff -Nru rocfft-5.5.0/.jenkins/precheckin.groovy rocfft-5.7.1/.jenkins/precheckin.groovy
--- rocfft-5.5.0/.jenkins/precheckin.groovy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.jenkins/precheckin.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -18,7 +18,7 @@
     prj.defaults.ccache = true
     prj.timeout.compile = 600
     prj.timeout.test = 600
-    prj.libraryDependencies = ['rocRAND']
+    prj.libraryDependencies = ['rocRAND','hipRAND']
 
     // Define test architectures, optional rocm version argument is available
     def nodes = new dockerNodes(nodeDetails, jobName, prj)
diff -Nru rocfft-5.5.0/.jenkins/staticanalysis.groovy rocfft-5.7.1/.jenkins/staticanalysis.groovy
--- rocfft-5.5.0/.jenkins/staticanalysis.groovy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.jenkins/staticanalysis.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -13,13 +13,6 @@
 {
     project.paths.construct_build_prefix()
 
-    def command = """#!/usr/bin/env bash
-            set -x
-            ${project.paths.project_build_prefix}/docs/run_doc.sh
-            """
-
-    platform.runCommand(this, command)
-
     def yapfCommand = """#!/usr/bin/env bash
                          set -x
                          cd ${project.paths.project_build_prefix}
@@ -30,14 +23,6 @@
                       """
 
     platform.runCommand(this, yapfCommand)
-    
-    publishHTML([allowMissing: false,
-                alwaysLinkToLastBuild: false,
-                keepAll: false,
-                reportDir: "${project.paths.project_build_prefix}/docs/source/_build/html",
-                reportFiles: "index.html",
-                reportName: "Documentation",
-                reportTitles: "Documentation"])
 }
 
 def runCI =
@@ -45,7 +30,7 @@
     nodeDetails, jobName->
 
     def prj  = new rocProject('rocFFT-internal', 'StaticAnalysis')
-    prj.libraryDependencies = ['rocRAND']
+    prj.libraryDependencies = ['rocRAND','hipRAND']
 
     // Define test architectures, optional rocm version argument is available
     def nodes = new dockerNodes(nodeDetails, jobName, prj)
diff -Nru rocfft-5.5.0/.jenkins/staticlibrary.groovy rocfft-5.7.1/.jenkins/staticlibrary.groovy
--- rocfft-5.5.0/.jenkins/staticlibrary.groovy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/.jenkins/staticlibrary.groovy	2023-08-09 16:19:51.000000000 +0000
@@ -18,7 +18,7 @@
     prj.defaults.ccache = true
     prj.timeout.compile = 600
     prj.timeout.test = 600
-    prj.libraryDependencies = ['rocRAND']
+    prj.libraryDependencies = ['rocRAND','hipRAND']
 
     // Define test architectures, optional rocm version argument is available
     def nodes = new dockerNodes(nodeDetails, jobName, prj)
diff -Nru rocfft-5.5.0/.readthedocs.yaml rocfft-5.7.1/.readthedocs.yaml
--- rocfft-5.5.0/.readthedocs.yaml	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/.readthedocs.yaml	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,14 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+sphinx:
+   configuration: docs/conf.py
+
+formats: [htmlzip]
+
+python:
+   version: "3.8"
+   install:
+   - requirements: docs/.sphinx/requirements.txt
diff -Nru rocfft-5.5.0/CHANGELOG.md rocfft-5.7.1/CHANGELOG.md
--- rocfft-5.5.0/CHANGELOG.md	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/CHANGELOG.md	2023-08-09 16:19:51.000000000 +0000
@@ -2,6 +2,35 @@
 
 Full documentation for rocFFT is available at [rocfft.readthedocs.io](https://rocfft.readthedocs.io/en/latest/).
 
+## rocFFT 1.0.24 for ROCm 5.7.0
+
+### Optimizations
+- Improved performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use Bluestein's algorithm.
+
+### Added
+- Implemented a solution map version converter and finished the first conversion from version 0 to version 1, where version 1 removes some incorrect kernels (sbrc/sbcr using half_lds).
+
+### Changed
+
+- Moved rocfft_rtc_helper executable to lib/rocFFT directory on Linux.
+- Moved library kernel cache to lib/rocFFT directory.
+
+## rocFFT 1.0.23 for ROCm 5.6.0
+
+### Added
+- Implemented half-precision transforms, which can be requested by passing rocfft_precision_half to rocfft_plan_create.
+- Implemented a hierarchical solution map which saves how to decompose a problem and the kernels to be used.
+- Implemented a first version of offline-tuner to support tuning kernels for C2C/Z2Z problems.
+
+### Changed
+- Replaced std::complex with hipComplex data types for data generator.
+- FFT plan dimensions are now sorted to be row-major internally where possible, which produces better plans if the dimensions were accidentally specified in a different order (column-major, for example).
+- Added --precision argument to benchmark/test clients.  --double is still accepted but is deprecated as a method to request a double-precision transform.
+- Improved performance test suite statistical framework.
+
+### Fixed
+- Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch failure.
+
 ## rocFFT 1.0.22 for ROCm 5.5.0
 
 ### Optimizations
@@ -43,8 +72,8 @@
 - Added gfx1100 and gfx1102 to default AMDGPU_TARGETS.
 
 ### Changed
-- Moved runtime compilation cache to in-memory by default.  A default on-disk cache can encounter contention problems 
-on multi-node clusters with a shared filesystem.  rocFFT can still be told to use an on-disk cache by setting the 
+- Moved runtime compilation cache to in-memory by default.  A default on-disk cache can encounter contention problems
+on multi-node clusters with a shared filesystem.  rocFFT can still be told to use an on-disk cache by setting the
 ROCFFT_RTC_CACHE_PATH environment variable.
 
 ## rocFFT 1.0.18 for ROCm 5.3.0
diff -Nru rocfft-5.5.0/CMakeLists.txt rocfft-5.7.1/CMakeLists.txt
--- rocfft-5.5.0/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 # #############################################################################
-# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,9 @@
 
 cmake_minimum_required( VERSION 3.16 )
 
+# We use C++17 features; this will add the compile option: -std=c++17
+set( CMAKE_CXX_STANDARD 17 )
+
 # This should appear before the project command, because it does not
 # use FORCE
 if( WIN32 )
@@ -46,16 +49,6 @@
 
 project( rocfft LANGUAGES CXX C )
 
-# Control hip-clang use:
-set( USE_HIP_CLANG OFF CACHE BOOL "Use hip-clang to build for amdgpu" )
-if( USE_HIP_CLANG )
-  message( STATUS "Use hip-clang to build for amdgpu backend" )
-  set( HIP_PLATFORM "hip-clang" )
-  set( HIP_COMPILER "clang" )
-else()
-  set( HIP_PLATFORM "hcc" )
-endif()
-
 # This finds the rocm-cmake project, and installs it if not found
 # rocm-cmake contains common cmake code for rocm projects to help setup and install
 set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
@@ -98,7 +91,7 @@
 include( ROCMHeaderWrapper )
 
 # Using standardized versioning from rocm-cmake
-set ( VERSION_STRING "1.0.21" )
+set ( VERSION_STRING "1.0.23" )
 rocm_setup_version( VERSION ${VERSION_STRING} )
 
 # Append our library helper cmake path and the cmake path for hip (for
@@ -123,6 +116,10 @@
 option(ROCFFT_RUNTIME_COMPILE "Enable runtime compilation of kernels" ON)
 option(ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF)
 
+# Pass -DROCFFT_BUILD_OFFLINE_TUNER=ON to also build the rocfft_offline_tuner executable.
+# It defaults to OFF since most users are not expected to tune.
+option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF)
+
 if(BUILD_ADDRESS_SANITIZER)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan")
@@ -147,7 +144,7 @@
 
 # Use target ID syntax if supported for AMDGPU_TARGETS
 rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
-  TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102")
+  TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
 set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
 list(LENGTH AMDGPU_TARGETS AMDGPU_TARGETS_LENGTH)
 
@@ -206,40 +203,40 @@
   if(BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_SELFTEST OR BUILD_CLIENTS_RIDER)
     find_package( Boost COMPONENTS program_options REQUIRED)
     set(BOOST_DEB "libboost-program-options${Boost_VERSION_MAJOR}.${Boost_VERSION_MINOR}.${Boost_VERSION_PATCH}")
-    set(BOOST_RPM "boost-devel = ${Boost_VERSION}")
+    set(BOOST_RPM "boost-program-options = ${Boost_VERSION_MAJOR}.${Boost_VERSION_MINOR}.${Boost_VERSION_PATCH}")
   endif()
   if( NOT CLIENTS_OS )
     rocm_set_os_id( CLIENTS_OS )
   endif()
   if(BUILD_CLIENTS_TESTS AND (NOT DEFINED BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP))
     set(OPENMP_DEB "libgomp1")
+    set(FFTW_DEB "libfftw3-bin")
     if(CLIENTS_OS STREQUAL "sles")
       set(OPENMP_RPM "libgomp1")
+      set(FFTW_RPM "libfftw3-3")
     else()
       set(OPENMP_RPM "libgomp")
+      set(FFTW_RPM "fftw-libs")
     endif()
   endif()
   if(CLIENTS_OS STREQUAL "sles")
-    set(BOOST_RPM RPM "libboost_program_options${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}-devel")
+    set(BOOST_RPM RPM "libboost_program_options${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}")
   endif()
   rocm_package_setup_component(clients)
-  rocm_package_setup_client_component(clients-common)
   if(BUILD_CLIENTS_TESTS)
     rocm_package_setup_client_component(
       tests
       DEPENDS
-        COMPONENT clients-common
-        DEB ${BOOST_DEB} ${OPENMP_DEB}
-        RPM ${BOOST_RPM} ${OPENMP_RPM}
+        DEB ${BOOST_DEB} ${OPENMP_DEB} ${FFTW_DEB} rocrand
+        RPM ${BOOST_RPM} ${OPENMP_RPM} ${FFTW_RPM} rocrand
     )
   endif()
   if(BUILD_CLIENTS_RIDER)
     rocm_package_setup_client_component(
       benchmarks
       DEPENDS
-        COMPONENT clients-common
-        DEB ${BOOST_DEB}
-        RPM ${BOOST_RPM}
+        DEB ${BOOST_DEB} rocrand
+        RPM ${BOOST_RPM} rocrand
     )
     rocm_install(
       DIRECTORY scripts/perf
diff -Nru rocfft-5.5.0/LICENSE.md rocfft-5.7.1/LICENSE.md
--- rocfft-5.5.0/LICENSE.md	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/LICENSE.md	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
  
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
diff -Nru rocfft-5.5.0/README.md rocfft-5.7.1/README.md
--- rocfft-5.5.0/README.md	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/README.md	2023-08-09 16:19:51.000000000 +0000
@@ -34,12 +34,11 @@
 
 A static library can be compiled by using the option `-DBUILD_SHARED_LIBS=off`
 
-To use the [hip-clang compiler][3], one must specify
-`-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang` to cmake.  rocFFT enables
-use of indirect function calls by default and requires ROCm 4.3 or
-higher to build successfully.  `-DROCFFT_CALLBACKS_ENABLED=off`
-may be specified to cmake to disable those calls on older ROCm
-compilers, though callbacks will not work correctly in this configuration.
+rocFFT enables use of indirect function calls by default and requires
+ROCm 4.3 or higher to build successfully.
+`-DROCFFT_CALLBACKS_ENABLED=off` may be specified to cmake to disable
+those calls on older ROCm compilers, though callbacks will not work
+correctly in this configuration.
 
 There are several clients included with rocFFT:
 1. rocfft-rider runs general transforms and is useful for performance analysis;
@@ -83,6 +82,18 @@
 
 Please refer to the [library documentation][4] for current documentation.
 
+### How to build documentation
+
+Please follow the steps below to build the documentation.
+
+```
+cd docs
+
+pip3 install -r .sphinx/requirements.txt
+
+python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+```
+
 ## Examples
 
 Examples may be found in the [clients/samples][5] subdirectory.
diff -Nru rocfft-5.5.0/clients/data_gen.h rocfft-5.7.1/clients/data_gen.h
--- rocfft-5.5.0/clients/data_gen.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/data_gen.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,1153 +0,0 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#ifndef DATA_GEN_H
-#define DATA_GEN_H
-
-#include "../shared/arithmetic.h"
-#include "../shared/gpubuf.h"
-#include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_vector_types.h>
-#include <rocrand/rocrand.h>
-#include <rocrand/rocrand_kernel.h>
-#include <vector>
-
-static const unsigned int DATA_GEN_THREADS = 32;
-
-template <typename T>
-struct input_val_1D
-{
-    T val1;
-};
-
-template <typename T>
-struct input_val_2D
-{
-    T val1;
-    T val2;
-};
-
-template <typename T>
-struct input_val_3D
-{
-    T val1;
-    T val2;
-    T val3;
-};
-
-template <typename T>
-static input_val_1D<T> get_input_val(const T& val)
-{
-    return input_val_1D<T>{val};
-}
-
-template <typename T>
-static input_val_2D<T> get_input_val(const std::tuple<T, T>& val)
-{
-    return input_val_2D<T>{std::get<0>(val), std::get<1>(val)};
-}
-
-template <typename T>
-static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val)
-{
-    return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)};
-}
-
-template <typename T>
-__device__ static size_t
-    compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base)
-{
-    return (length.val1 * stride.val1) + base;
-}
-
-template <typename T>
-__device__ static size_t
-    compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base)
-{
-    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base;
-}
-
-template <typename T>
-__device__ static size_t
-    compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base)
-{
-    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3)
-           + base;
-}
-
-template <typename T>
-static inline input_val_1D<T> make_zero_length(const input_val_1D<T>& whole_length)
-{
-    return input_val_1D<T>{0};
-}
-
-template <typename T>
-static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length)
-{
-    return input_val_2D<T>{0, 0};
-}
-
-template <typename T>
-static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length)
-{
-    return input_val_3D<T>{0, 0, 0};
-}
-
-template <typename T>
-__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length)
-{
-    auto xlen = whole_length.val1;
-
-    auto xidx = i % xlen;
-
-    return input_val_1D<T>{xidx};
-}
-
-template <typename T>
-__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length)
-{
-    auto xlen = whole_length.val1;
-
-    auto yidx = i / xlen;
-
-    return yidx;
-}
-
-template <typename T>
-__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length)
-{
-    auto xlen = whole_length.val1;
-    auto ylen = whole_length.val2;
-
-    auto xidx = i % xlen;
-    auto yidx = i / xlen % ylen;
-
-    return input_val_2D<T>{xidx, yidx};
-}
-
-template <typename T>
-__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length)
-{
-    auto xlen = whole_length.val1;
-    auto ylen = whole_length.val2;
-
-    auto zidx = i / xlen / ylen;
-
-    return zidx;
-}
-
-template <typename T>
-__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length)
-{
-    auto xlen = whole_length.val1;
-    auto ylen = whole_length.val2;
-    auto zlen = whole_length.val3;
-
-    auto xidx = i % xlen;
-    auto yidx = i / xlen % ylen;
-    auto zidx = i / xlen / ylen % zlen;
-
-    return input_val_3D<T>{xidx, yidx, zidx};
-}
-
-template <typename T>
-__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length)
-{
-    auto xlen = length.val1;
-    auto ylen = length.val2;
-    auto zlen = length.val3;
-
-    auto widx = i / xlen / ylen / zlen;
-
-    return widx;
-}
-
-template <typename T1>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    generate_float_interleaved_data_kernel(const T1             whole_length,
-                                           const T1             zero_length,
-                                           size_t               idist,
-                                           size_t               isize,
-                                           const T1             istride,
-                                           std::complex<float>* data)
-{
-    auto const i = threadIdx.x + blockIdx.x * blockDim.x;
-    if(i < isize)
-    {
-        auto i_length = get_length(i, whole_length);
-        auto i_batch  = get_batch(i, whole_length);
-        auto i_base   = i_batch * idist;
-
-        auto seed = compute_index(zero_length, istride, i_base);
-        auto idx  = compute_index(i_length, istride, i_base);
-
-        rocrand_state_philox4x32_10 gen_state;
-        rocrand_init(seed, idx, 0, &gen_state);
-
-        auto item = rocrand_uniform2(&gen_state);
-
-        data[idx] = std::complex<float>(item.x, item.y);
-    }
-}
-
-template <typename T1>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    generate_double_interleaved_data_kernel(const T1              whole_length,
-                                            const T1              zero_length,
-                                            size_t                idist,
-                                            size_t                isize,
-                                            const T1              istride,
-                                            std::complex<double>* data)
-{
-    auto const i = threadIdx.x + blockIdx.x * blockDim.x;
-    if(i < isize)
-    {
-        auto i_length = get_length(i, whole_length);
-        auto i_batch  = get_batch(i, whole_length);
-        auto i_base   = i_batch * idist;
-
-        auto seed = compute_index(zero_length, istride, i_base);
-        auto idx  = compute_index(i_length, istride, i_base);
-
-        rocrand_state_philox4x32_10 gen_state;
-        rocrand_init(seed, idx, 0, &gen_state);
-
-        auto item = rocrand_uniform_double2(&gen_state);
-
-        data[idx] = std::complex<double>(item.x, item.y);
-    }
-}
-
-template <typename T1>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    generate_float_planar_data_kernel(const T1 whole_length,
-                                      const T1 zero_length,
-                                      size_t   idist,
-                                      size_t   isize,
-                                      const T1 istride,
-                                      float*   real_data,
-                                      float*   imag_data)
-{
-    auto const i = threadIdx.x + blockIdx.x * blockDim.x;
-    if(i < isize)
-    {
-        auto i_length = get_length(i, whole_length);
-        auto i_batch  = get_batch(i, whole_length);
-        auto i_base   = i_batch * idist;
-
-        auto seed = compute_index(zero_length, istride, i_base);
-        auto idx  = compute_index(i_length, istride, i_base);
-
-        rocrand_state_philox4x32_10 gen_state;
-        rocrand_init(seed, idx, 0, &gen_state);
-
-        auto item = rocrand_uniform2(&gen_state);
-
-        real_data[idx] = item.x;
-        imag_data[idx] = item.y;
-    }
-}
-
-template <typename T1>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    generate_double_planar_data_kernel(const T1 whole_length,
-                                       const T1 zero_length,
-                                       size_t   idist,
-                                       size_t   isize,
-                                       const T1 istride,
-                                       double*  real_data,
-                                       double*  imag_data)
-{
-    auto const i = threadIdx.x + blockIdx.x * blockDim.x;
-    if(i < isize)
-    {
-        auto i_length = get_length(i, whole_length);
-        auto i_batch  = get_batch(i, whole_length);
-        auto i_base   = i_batch * idist;
-
-        auto seed = compute_index(zero_length, istride, i_base);
-        auto idx  = compute_index(i_length, istride, i_base);
-
-        rocrand_state_philox4x32_10 gen_state;
-        rocrand_init(seed, idx, 0, &gen_state);
-
-        auto item = rocrand_uniform_double2(&gen_state);
-
-        real_data[idx] = item.x;
-        imag_data[idx] = item.y;
-    }
-}
-
-template <typename T1>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    generate_float_real_data_kernel(const T1 whole_length,
-                                    const T1 zero_length,
-                                    size_t   idist,
-                                    size_t   isize,
-                                    const T1 istride,
-                                    float*   data)
-{
-    auto const i = threadIdx.x + blockIdx.x * blockDim.x;
-    if(i < isize)
-    {
-        auto i_length = get_length(i, whole_length);
-        auto i_batch  = get_batch(i, whole_length);
-        auto i_base   = i_batch * idist;
-
-        auto seed = compute_index(zero_length, istride, i_base);
-        auto idx  = compute_index(i_length, istride, i_base);
-
-        rocrand_state_philox4x32_10 gen_state;
-        rocrand_init(seed, idx, 0, &gen_state);
-
-        data[idx] = rocrand_uniform(&gen_state);
-    }
-}
-
-template <typename T1>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    generate_double_real_data_kernel(const T1 whole_length,
-                                     const T1 zero_length,
-                                     size_t   idist,
-                                     size_t   isize,
-                                     const T1 istride,
-                                     double*  data)
-{
-    auto const i = threadIdx.x + blockIdx.x * blockDim.x;
-    if(i < isize)
-    {
-        auto i_length = get_length(i, whole_length);
-        auto i_batch  = get_batch(i, whole_length);
-        auto i_base   = i_batch * idist;
-
-        auto seed = compute_index(zero_length, istride, i_base);
-        auto idx  = compute_index(i_length, istride, i_base);
-
-        rocrand_state_philox4x32_10 gen_state;
-        rocrand_init(seed, idx, 0, &gen_state);
-
-        data[idx] = rocrand_uniform_double(&gen_state);
-    }
-}
-
-// For complex-to-real transforms, the input data must be Hermitiam-symmetric.
-// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
-// space.  For multi-dimensional data, this means that we only need to store a bit more
-// than half of the complex values; the rest are redundant.  However, there are still
-// some restrictions:
-// * the origin and Nyquist value(s) must be real-valued
-// * some of the remaining values are still redundant, and you might get different results
-//   than you expect if the values don't agree.
-// Below are some example kernels which impose Hermitian symmetry on a complex array
-// of the given dimensions.
-
-// Kernels for imposing Hermitian symmetry on 1D
-// complex (interleaved/planar) data on the GPU.
-
-template <typename Tfloat>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    impose_hermitian_symmetry_interleaved_1(std::complex<Tfloat>* x,
-                                            const size_t          Nx,
-                                            const size_t          xstride,
-                                            const size_t          dist,
-                                            const size_t          nbatch,
-                                            const bool            Nxeven)
-{
-    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(idx < nbatch)
-    {
-        idx *= dist;
-
-        // The DC mode must be real-valued.
-        x[idx].imag(0);
-
-        if(Nxeven)
-        {
-            // Nyquist mode
-            auto pos = idx + (Nx / 2) * xstride;
-            x[pos].imag(0);
-        }
-    }
-}
-
-template <typename Tfloat>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS)
-    impose_hermitian_symmetry_planar_1(Tfloat*      xreal,
-                                       Tfloat*      ximag,
-                                       const size_t Nx,
-                                       const size_t xstride,
-                                       const size_t dist,
-                                       const size_t nbatch,
-                                       const bool   Nxeven)
-{
-    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(idx < nbatch)
-    {
-        idx *= dist;
-
-        // The DC mode must be real-valued.
-        ximag[idx] = 0;
-
-        if(Nxeven)
-        {
-            // Nyquist mode
-            auto pos   = idx + (Nx / 2) * xstride;
-            ximag[pos] = 0;
-        }
-    }
-}
-
-// Kernels for imposing Hermitian symmetry on 2D
-// complex (interleaved/planar) data on the GPU.
-
-template <typename Tfloat>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS)
-    impose_hermitian_symmetry_interleaved_2(std::complex<Tfloat>* x,
-                                            const size_t          Nx,
-                                            const size_t          Ny,
-                                            const size_t          xstride,
-                                            const size_t          ystride,
-                                            const size_t          dist,
-                                            const size_t          nbatch,
-                                            const bool            Nxeven,
-                                            const bool            Nyeven)
-{
-    auto       idx = blockIdx.y * blockDim.y + threadIdx.y;
-    const auto idy = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(idy < (Ny / 2 + 1) && idx < nbatch)
-    {
-        idx *= dist;
-
-        auto pos  = idx + idy * ystride;
-        auto cpos = idx + ((Ny - idy) % Ny) * ystride;
-
-        auto val = x[pos];
-
-        // DC mode:
-        if(idy == 0)
-            val.imag(0);
-
-        // Axes need to be symmetrized:
-        if(idy > 0 && idy < (Ny + 1) / 2)
-            val = std::conj(val);
-
-        // y-Nyquist
-        if(Nyeven && idy == Ny / 2)
-            val.imag(0);
-
-        x[cpos] = val;
-
-        if(Nxeven)
-        {
-            pos += (Nx / 2) * xstride;
-            cpos += (Nx / 2) * xstride;
-
-            val = x[pos];
-
-            // DC mode:
-            if(idy == 0)
-                val.imag(0);
-
-            // Axes need to be symmetrized:
-            if(idy > 0 && idy < (Ny + 1) / 2)
-                val = std::conj(val);
-
-            // y-Nyquist
-            if(Nyeven && idy == Ny / 2)
-                val.imag(0);
-
-            x[cpos] = val;
-        }
-    }
-}
-
-template <typename Tfloat>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS)
-    impose_hermitian_symmetry_planar_2(Tfloat*      xreal,
-                                       Tfloat*      ximag,
-                                       const size_t Nx,
-                                       const size_t Ny,
-                                       const size_t xstride,
-                                       const size_t ystride,
-                                       const size_t dist,
-                                       const size_t nbatch,
-                                       const bool   Nxeven,
-                                       const bool   Nyeven)
-{
-    auto       idx = blockIdx.y * blockDim.y + threadIdx.y;
-    const auto idy = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(idy < (Ny / 2 + 1) && idx < nbatch)
-    {
-        idx *= dist;
-
-        auto pos  = idx + idy * ystride;
-        auto cpos = idx + ((Ny - idy) % Ny) * ystride;
-
-        auto valreal = xreal[pos];
-        auto valimag = ximag[pos];
-
-        // DC mode:
-        if(idy == 0)
-            valimag = 0;
-
-        // Axes need to be symmetrized:
-        if(idy > 0 && idy < (Ny + 1) / 2)
-            valimag = -valimag;
-
-        // y-Nyquist
-        if(Nyeven && idy == Ny / 2)
-            valimag = 0;
-
-        xreal[cpos] = valreal;
-        ximag[cpos] = valimag;
-
-        if(Nxeven)
-        {
-            pos += (Nx / 2) * xstride;
-            cpos += (Nx / 2) * xstride;
-
-            valreal = xreal[pos];
-            valimag = ximag[pos];
-
-            // DC mode:
-            if(idy == 0)
-                valimag = 0;
-
-            // Axes need to be symmetrized:
-            if(idy > 0 && idy < (Ny + 1) / 2)
-                valimag = -valimag;
-
-            // y-Nyquist
-            if(Nyeven && idy == Ny / 2)
-                valimag = 0;
-
-            xreal[cpos] = valreal;
-            ximag[cpos] = valimag;
-        }
-    }
-}
-
-// Kernels for imposing Hermitian symmetry on 3D
-// complex (interleaved/planar) data on the GPU.
-
-template <typename Tfloat>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS* DATA_GEN_THREADS)
-    impose_hermitian_symmetry_interleaved_3(std::complex<Tfloat>* x,
-                                            const size_t          Nx,
-                                            const size_t          Ny,
-                                            const size_t          Nz,
-                                            const size_t          xstride,
-                                            const size_t          ystride,
-                                            const size_t          zstride,
-                                            const size_t          dist,
-                                            const size_t          nbatch,
-                                            const bool            Nxeven,
-                                            const bool            Nyeven,
-                                            const bool            Nzeven)
-{
-    const auto idy = blockIdx.x * blockDim.x + threadIdx.x;
-    const auto idz = blockIdx.y * blockDim.y + threadIdx.y;
-    auto       idx = blockIdx.z * blockDim.z + threadIdx.z;
-
-    if(idy < Ny && idz < Nz && idx < nbatch)
-    {
-        idx *= dist;
-
-        auto pos  = idx + idy * ystride + idz * zstride;
-        auto cpos = idx + ((Ny - idy) % Ny) * ystride + ((Nz - idz) % Nz) * zstride;
-
-        // Origin
-        if(idy == 0 && idz == 0)
-        {
-            x[pos].imag(0);
-        }
-
-        // y-Nyquist
-        if(Nyeven && idy == Ny / 2 && idz == 0)
-        {
-            x[pos].imag(0);
-        }
-
-        // z-Nyquist
-        if(Nzeven && idz == Nz / 2 && idy == 0)
-        {
-            x[pos].imag(0);
-        }
-
-        // yz-Nyquist
-        if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
-        {
-            x[pos].imag(0);
-        }
-
-        // z-axis
-        if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
-            x[cpos] = std::conj(x[pos]);
-
-        // y-Nyquist axis
-        if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
-            x[cpos] = std::conj(x[pos]);
-
-        // y-axis
-        if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
-            x[cpos] = std::conj(x[pos]);
-
-        // z-Nyquist axis
-        if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
-            x[cpos] = std::conj(x[pos]);
-
-        // yz plane
-        if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
-            x[cpos] = std::conj(x[pos]);
-
-        if(Nxeven)
-        {
-            pos += (Nx / 2) * xstride;
-            cpos += (Nx / 2) * xstride;
-            // Origin
-            if(idy == 0 && idz == 0)
-                x[pos].imag(0);
-
-            // y-Nyquist
-            if(Nyeven && idy == Ny / 2 && idz == 0)
-                x[pos].imag(0);
-
-            // z-Nyquist
-            if(Nzeven && idz == Nz / 2 && idy == 0)
-                x[pos].imag(0);
-
-            // yz-Nyquist
-            if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
-                x[pos].imag(0);
-
-            // z-axis
-            if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
-                x[cpos] = std::conj(x[pos]);
-
-            // y-Nyquist axis
-            if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
-                x[cpos] = std::conj(x[pos]);
-
-            // y-axis
-            if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
-                x[cpos] = std::conj(x[pos]);
-
-            // z-Nyquist axis
-            if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
-                x[cpos] = std::conj(x[pos]);
-
-            // yz plane
-            if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
-                x[cpos] = std::conj(x[pos]);
-        }
-    }
-}
-
-template <typename Tfloat>
-__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS* DATA_GEN_THREADS)
-    impose_hermitian_symmetry_planar_3(Tfloat*      xreal,
-                                       Tfloat*      ximag,
-                                       const size_t Nx,
-                                       const size_t Ny,
-                                       const size_t Nz,
-                                       const size_t xstride,
-                                       const size_t ystride,
-                                       const size_t zstride,
-                                       const size_t dist,
-                                       const size_t nbatch,
-                                       const bool   Nxeven,
-                                       const bool   Nyeven,
-                                       const bool   Nzeven)
-{
-    const auto idy = blockIdx.x * blockDim.x + threadIdx.x;
-    const auto idz = blockIdx.y * blockDim.y + threadIdx.y;
-    auto       idx = blockIdx.z * blockDim.z + threadIdx.z;
-
-    if(idy < Ny && idz < Nz && idx < nbatch)
-    {
-        idx *= dist;
-
-        auto pos  = idx + idy * ystride + idz * zstride;
-        auto cpos = idx + ((Ny - idy) % Ny) * ystride + ((Nz - idz) % Nz) * zstride;
-
-        // Origin
-        if(idy == 0 && idz == 0)
-        {
-            ximag[pos] = 0;
-        }
-
-        // y-Nyquist
-        if(Nyeven && idy == Ny / 2 && idz == 0)
-        {
-            ximag[pos] = 0;
-        }
-
-        // z-Nyquist
-        if(Nzeven && idz == Nz / 2 && idy == 0)
-        {
-            ximag[pos] = 0;
-        }
-
-        // yz-Nyquist
-        if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
-        {
-            ximag[pos] = 0;
-        }
-
-        // z-axis
-        if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
-        {
-            xreal[cpos] = xreal[pos];
-            ximag[cpos] = -ximag[pos];
-        }
-
-        // y-Nyquist axis
-        if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
-        {
-            xreal[cpos] = xreal[pos];
-            ximag[cpos] = -ximag[pos];
-        }
-
-        // y-axis
-        if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
-        {
-            xreal[cpos] = xreal[pos];
-            ximag[cpos] = -ximag[pos];
-        }
-
-        // z-Nyquist axis
-        if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
-        {
-            xreal[cpos] = xreal[pos];
-            ximag[cpos] = -ximag[pos];
-        }
-
-        // yz plane
-        if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
-        {
-            xreal[cpos] = xreal[pos];
-            ximag[cpos] = -ximag[pos];
-        }
-
-        if(Nxeven)
-        {
-            pos += (Nx / 2) * xstride;
-            cpos += (Nx / 2) * xstride;
-            // Origin
-            if(idy == 0 && idz == 0)
-                ximag[pos] = 0;
-
-            // y-Nyquist
-            if(Nyeven && idy == Ny / 2 && idz == 0)
-                ximag[pos] = 0;
-
-            // z-Nyquist
-            if(Nzeven && idz == Nz / 2 && idy == 0)
-                ximag[pos] = 0;
-
-            // yz-Nyquist
-            if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
-                ximag[pos] = 0;
-
-            // z-axis
-            if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
-            {
-                xreal[cpos] = xreal[pos];
-                ximag[cpos] = -ximag[pos];
-            }
-
-            // y-Nyquist axis
-            if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
-            {
-                xreal[cpos] = xreal[pos];
-                ximag[cpos] = -ximag[pos];
-            }
-
-            // y-axis
-            if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
-            {
-                xreal[cpos] = xreal[pos];
-                ximag[cpos] = -ximag[pos];
-            }
-
-            // z-Nyquist axis
-            if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
-            {
-                xreal[cpos] = xreal[pos];
-                ximag[cpos] = -ximag[pos];
-            }
-
-            // yz plane
-            if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
-            {
-                xreal[cpos] = xreal[pos];
-                ximag[cpos] = -ximag[pos];
-            }
-        }
-    }
-}
-
-template <typename Tint>
-inline void generate_interleaved_data(const Tint&          whole_length,
-                                      const size_t         idist,
-                                      const size_t         isize,
-                                      const Tint&          istride,
-                                      std::complex<float>* input_data)
-{
-    auto blockSize       = DATA_GEN_THREADS;
-    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
-
-    auto input_length = get_input_val(whole_length);
-    auto zero_length  = make_zero_length(input_length);
-    auto input_stride = get_input_val(istride);
-
-    hipLaunchKernelGGL(generate_float_interleaved_data_kernel,
-                       dim3(numBlocks_setup),
-                       dim3(blockSize),
-                       0, // sharedMemBytes
-                       0, // stream
-                       input_length,
-                       zero_length,
-                       idist,
-                       isize,
-                       input_stride,
-                       input_data);
-}
-
-template <typename Tint>
-inline void generate_interleaved_data(const Tint&           whole_length,
-                                      const size_t          idist,
-                                      const size_t          isize,
-                                      const Tint&           istride,
-                                      std::complex<double>* input_data)
-{
-    auto blockSize       = DATA_GEN_THREADS;
-    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
-
-    auto input_length = get_input_val(whole_length);
-    auto zero_length  = make_zero_length(input_length);
-    auto input_stride = get_input_val(istride);
-
-    hipLaunchKernelGGL(generate_double_interleaved_data_kernel,
-                       dim3(numBlocks_setup),
-                       dim3(blockSize),
-                       0, // sharedMemBytes
-                       0, // stream
-                       input_length,
-                       zero_length,
-                       idist,
-                       isize,
-                       input_stride,
-                       input_data);
-}
-
-template <typename Tint>
-inline void generate_planar_data(const Tint&  whole_length,
-                                 const size_t idist,
-                                 const size_t isize,
-                                 const Tint&  istride,
-                                 float*       real_data,
-                                 float*       imag_data)
-{
-    auto blockSize       = DATA_GEN_THREADS;
-    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
-
-    auto input_length = get_input_val(whole_length);
-    auto zero_length  = make_zero_length(input_length);
-    auto input_stride = get_input_val(istride);
-
-    hipLaunchKernelGGL(generate_float_planar_data_kernel,
-                       dim3(numBlocks_setup),
-                       dim3(blockSize),
-                       0, // sharedMemBytes
-                       0, // stream
-                       input_length,
-                       zero_length,
-                       idist,
-                       isize,
-                       input_stride,
-                       real_data,
-                       imag_data);
-}
-
-template <typename Tint>
-inline void generate_planar_data(const Tint&  whole_length,
-                                 const size_t idist,
-                                 const size_t isize,
-                                 const Tint&  istride,
-                                 double*      real_data,
-                                 double*      imag_data)
-{
-    auto blockSize       = DATA_GEN_THREADS;
-    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
-
-    auto input_length = get_input_val(whole_length);
-    auto zero_length  = make_zero_length(input_length);
-    auto input_stride = get_input_val(istride);
-
-    hipLaunchKernelGGL(generate_double_planar_data_kernel,
-                       dim3(numBlocks_setup),
-                       dim3(blockSize),
-                       0, // sharedMemBytes
-                       0, // stream
-                       input_length,
-                       zero_length,
-                       idist,
-                       isize,
-                       input_stride,
-                       real_data,
-                       imag_data);
-}
-
-template <typename Tint>
-inline void generate_real_data(const Tint&  whole_length,
-                               const size_t idist,
-                               const size_t isize,
-                               const Tint&  istride,
-                               float*       input_data)
-{
-    auto blockSize       = DATA_GEN_THREADS;
-    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
-
-    auto input_length = get_input_val(whole_length);
-    auto zero_length  = make_zero_length(input_length);
-    auto input_stride = get_input_val(istride);
-
-    hipLaunchKernelGGL(generate_float_real_data_kernel,
-                       dim3(numBlocks_setup),
-                       dim3(blockSize),
-                       0, // sharedMemBytes
-                       0, // stream
-                       input_length,
-                       zero_length,
-                       idist,
-                       isize,
-                       input_stride,
-                       input_data);
-}
-
-template <typename Tint>
-inline void generate_real_data(const Tint&  whole_length,
-                               const size_t idist,
-                               const size_t isize,
-                               const Tint&  istride,
-                               double*      input_data)
-{
-    auto blockSize       = DATA_GEN_THREADS;
-    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
-
-    auto input_length = get_input_val(whole_length);
-    auto zero_length  = make_zero_length(input_length);
-    auto input_stride = get_input_val(istride);
-
-    hipLaunchKernelGGL(generate_double_real_data_kernel,
-                       dim3(numBlocks_setup),
-                       dim3(blockSize),
-                       0, // sharedMemBytes
-                       0, // stream
-                       input_length,
-                       zero_length,
-                       idist,
-                       isize,
-                       input_stride,
-                       input_data);
-}
-
-template <typename Tfloat>
-void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length,
-                                           const std::vector<size_t>& ilength,
-                                           const std::vector<size_t>& stride,
-                                           size_t                     dist,
-                                           size_t                     batch,
-                                           std::complex<Tfloat>*      input_data)
-{
-    auto blockSize = DATA_GEN_THREADS;
-
-    switch(length.size())
-    {
-    case 1:
-    {
-        const auto gridDim  = dim3(blockSize);
-        const auto blockDim = dim3(DivRoundingUp<size_t>(batch, blockSize));
-
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1<Tfloat>,
-                           gridDim,
-                           blockDim,
-                           0,
-                           0,
-                           input_data,
-                           length[0],
-                           stride[0],
-                           dist,
-                           batch,
-                           length[0] % 2 == 0);
-
-        break;
-    }
-    case 2:
-    {
-        const auto gridDim  = dim3(blockSize, blockSize);
-        const auto blockDim = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
-                                   DivRoundingUp<size_t>(batch, blockSize));
-
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2<Tfloat>,
-                           gridDim,
-                           blockDim,
-                           0,
-                           0,
-                           input_data,
-                           length[1],
-                           length[0],
-                           stride[1],
-                           stride[0],
-                           dist,
-                           batch,
-                           length[1] % 2 == 0,
-                           length[0] % 2 == 0);
-
-        break;
-    }
-    case 3:
-    {
-        const auto gridDim  = dim3(blockSize, blockSize, blockSize);
-        const auto blockDim = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
-                                   DivRoundingUp<size_t>(ilength[1], blockSize),
-                                   DivRoundingUp<size_t>(batch, blockSize));
-
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3<Tfloat>,
-                           gridDim,
-                           blockDim,
-                           0,
-                           0,
-                           input_data,
-                           length[2],
-                           length[0],
-                           length[1],
-                           stride[2],
-                           stride[0],
-                           stride[1],
-                           dist,
-                           batch,
-                           length[2] % 2 == 0,
-                           length[0] % 2 == 0,
-                           length[1] % 2 == 0);
-        break;
-    }
-    default:
-        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
-    }
-}
-
-template <typename Tfloat>
-void impose_hermitian_symmetry_planar(const std::vector<size_t>& length,
-                                      const std::vector<size_t>& ilength,
-                                      const std::vector<size_t>& stride,
-                                      size_t                     dist,
-                                      size_t                     batch,
-                                      Tfloat*                    input_data_real,
-                                      Tfloat*                    input_data_imag)
-{
-    auto blockSize = DATA_GEN_THREADS;
-
-    switch(length.size())
-    {
-    case 1:
-    {
-        const auto gridDim  = dim3(blockSize);
-        const auto blockDim = dim3(DivRoundingUp<size_t>(batch, blockSize));
-
-        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1<Tfloat>,
-                           gridDim,
-                           blockDim,
-                           0,
-                           0,
-                           input_data_real,
-                           input_data_imag,
-                           length[0],
-                           stride[0],
-                           dist,
-                           batch,
-                           length[0] % 2 == 0);
-
-        break;
-    }
-    case 2:
-    {
-        const auto gridDim  = dim3(blockSize, blockSize);
-        const auto blockDim = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
-                                   DivRoundingUp<size_t>(batch, blockSize));
-
-        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2<Tfloat>,
-                           gridDim,
-                           blockDim,
-                           0,
-                           0,
-                           input_data_real,
-                           input_data_imag,
-                           length[1],
-                           length[0],
-                           stride[1],
-                           stride[0],
-                           dist,
-                           batch,
-                           length[1] % 2 == 0,
-                           length[0] % 2 == 0);
-
-        break;
-    }
-    case 3:
-    {
-        const auto gridDim  = dim3(blockSize, blockSize, blockSize);
-        const auto blockDim = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
-                                   DivRoundingUp<size_t>(ilength[1], blockSize),
-                                   DivRoundingUp<size_t>(batch, blockSize));
-
-        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3<Tfloat>,
-                           gridDim,
-                           blockDim,
-                           0,
-                           0,
-                           input_data_real,
-                           input_data_imag,
-                           length[2],
-                           length[0],
-                           length[1],
-                           stride[2],
-                           stride[0],
-                           stride[1],
-                           dist,
-                           batch,
-                           length[2] % 2 == 0,
-                           length[0] % 2 == 0,
-                           length[1] % 2 == 0);
-        break;
-    }
-    default:
-        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
-    }
-}
-
-#endif // DATA_GEN_H
\ No newline at end of file
diff -Nru rocfft-5.5.0/clients/fft_params.h rocfft-5.7.1/clients/fft_params.h
--- rocfft-5.5.0/clients/fft_params.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/fft_params.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,2730 +0,0 @@
-// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#ifndef FFT_PARAMS_H
-#define FFT_PARAMS_H
-
-#include "data_gen.h"
-#include <algorithm>
-#include <complex>
-#include <hip/hip_runtime.h>
-#include <iostream>
-#include <mutex>
-#include <numeric>
-#include <omp.h>
-#include <random>
-#include <tuple>
-#include <unordered_set>
-#include <vector>
-
-#include "../shared/array_validator.h"
-#include "../shared/printbuffer.h"
-#include "../shared/ptrdiff.h"
-
-enum fft_status
-{
-    fft_status_success,
-    fft_status_failure,
-    fft_status_invalid_arg_value,
-    fft_status_invalid_dimensions,
-    fft_status_invalid_array_type,
-    fft_status_invalid_strides,
-    fft_status_invalid_distance,
-    fft_status_invalid_offset,
-    fft_status_invalid_work_buffer,
-};
-
-enum fft_transform_type
-{
-    fft_transform_type_complex_forward,
-    fft_transform_type_complex_inverse,
-    fft_transform_type_real_forward,
-    fft_transform_type_real_inverse,
-};
-
-enum fft_precision
-{
-    fft_precision_single,
-    fft_precision_double,
-};
-
-enum fft_array_type
-{
-    fft_array_type_complex_interleaved,
-    fft_array_type_complex_planar,
-    fft_array_type_real,
-    fft_array_type_hermitian_interleaved,
-    fft_array_type_hermitian_planar,
-    fft_array_type_unset,
-};
-
-enum fft_result_placement
-{
-    fft_placement_inplace,
-    fft_placement_notinplace,
-};
-
-// Determine the size of the data type given the precision and type.
-template <typename Tsize>
-inline Tsize var_size(const fft_precision precision, const fft_array_type type)
-{
-    size_t var_size = 0;
-    switch(precision)
-    {
-    case fft_precision_single:
-        var_size = sizeof(float);
-        break;
-    case fft_precision_double:
-        var_size = sizeof(double);
-        break;
-    }
-    switch(type)
-    {
-    case fft_array_type_complex_interleaved:
-    case fft_array_type_hermitian_interleaved:
-        var_size *= 2;
-        break;
-    default:
-        break;
-    }
-    return var_size;
-}
-
-// Container class for test parameters.
-class fft_params
-{
-public:
-    // All parameters are row-major.
-    std::vector<size_t>  length;
-    std::vector<size_t>  istride;
-    std::vector<size_t>  ostride;
-    size_t               nbatch         = 1;
-    fft_precision        precision      = fft_precision_double;
-    fft_transform_type   transform_type = fft_transform_type_complex_forward;
-    fft_result_placement placement      = fft_placement_inplace;
-    size_t               idist          = 0;
-    size_t               odist          = 0;
-    fft_array_type       itype          = fft_array_type_unset;
-    fft_array_type       otype          = fft_array_type_unset;
-    std::vector<size_t>  ioffset        = {0, 0};
-    std::vector<size_t>  ooffset        = {0, 0};
-
-    std::vector<size_t> isize;
-    std::vector<size_t> osize;
-
-    size_t workbuffersize = 0;
-
-    // run testing load/store callbacks
-    bool                    run_callbacks   = false;
-    static constexpr double load_cb_scalar  = 0.457813941;
-    static constexpr double store_cb_scalar = 0.391504938;
-
-    // Check that data outside of output strides is not overwritten.
-    // This is only set explicitly on some tests where there's space
-    // between dimensions, but the dimensions are still in-order.
-    // We're not trying to generically find holes in arbitrary data
-    // layouts.
-    //
-    // NOTE: this flag is not included in tokens, since it doesn't
-    // affect how the FFT library behaves.
-    bool check_output_strides = false;
-
-    // scaling factor - we do a pointwise multiplication of outputs by
-    // this factor
-    double scale_factor = 1.0;
-
-    fft_params(){};
-    virtual ~fft_params(){};
-
-    // Given an array type, return the name as a string.
-    static std::string array_type_name(const fft_array_type type, bool verbose = true)
-    {
-        switch(type)
-        {
-        case fft_array_type_complex_interleaved:
-            return verbose ? "fft_array_type_complex_interleaved" : "CI";
-        case fft_array_type_complex_planar:
-            return verbose ? "fft_array_type_complex_planar" : "CP";
-        case fft_array_type_real:
-            return verbose ? "fft_array_type_real" : "R";
-        case fft_array_type_hermitian_interleaved:
-            return verbose ? "fft_array_type_hermitian_interleaved" : "HI";
-        case fft_array_type_hermitian_planar:
-            return verbose ? "fft_array_type_hermitian_planar" : "HP";
-        case fft_array_type_unset:
-            return verbose ? "fft_array_type_unset" : "UN";
-        }
-        return "";
-    }
-
-    std::string transform_type_name() const
-    {
-        switch(transform_type)
-        {
-        case fft_transform_type_complex_forward:
-            return "fft_transform_type_complex_forward";
-        case fft_transform_type_complex_inverse:
-            return "fft_transform_type_complex_inverse";
-        case fft_transform_type_real_forward:
-            return "fft_transform_type_real_forward";
-        case fft_transform_type_real_inverse:
-            return "fft_transform_type_real_inverse";
-        default:
-            throw std::runtime_error("Invalid transform type");
-        }
-    }
-
-    // Convert to string for output.
-    std::string str(const std::string& separator = ", ") const
-    {
-        std::stringstream ss;
-        ss << "length:";
-        for(auto i : length)
-            ss << " " << i;
-        ss << separator;
-        ss << "istride:";
-        for(auto i : istride)
-            ss << " " << i;
-        ss << separator;
-        ss << "idist: " << idist << separator;
-
-        ss << "ostride:";
-        for(auto i : ostride)
-            ss << " " << i;
-        ss << separator;
-        ss << "odist: " << odist << separator;
-
-        ss << "batch: " << nbatch << separator;
-        ss << "isize:";
-        for(auto i : isize)
-            ss << " " << i;
-        ss << separator;
-        ss << "osize:";
-        for(auto i : osize)
-            ss << " " << i;
-        ss << separator;
-
-        ss << "ioffset:";
-        for(auto i : ioffset)
-            ss << " " << i;
-        ss << separator;
-        ss << "ooffset:";
-        for(auto i : ooffset)
-            ss << " " << i;
-        ss << separator;
-
-        if(placement == fft_placement_inplace)
-            ss << "in-place";
-        else
-            ss << "out-of-place";
-        ss << separator;
-        ss << "transform_type: " << transform_type_name() << separator;
-        ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator;
-        if(precision == fft_precision_single)
-            ss << "single-precision";
-        else
-            ss << "double-precision";
-        ss << separator;
-
-        ss << "ilength:";
-        for(const auto i : ilength())
-            ss << " " << i;
-        ss << separator;
-        ss << "olength:";
-        for(const auto i : olength())
-            ss << " " << i;
-        ss << separator;
-
-        ss << "ibuffer_size:";
-        for(const auto i : ibuffer_sizes())
-            ss << " " << i;
-        ss << separator;
-
-        ss << "obuffer_size:";
-        for(const auto i : obuffer_sizes())
-            ss << " " << i;
-        ss << separator;
-
-        if(scale_factor != 1.0)
-            ss << "scale factor: " << scale_factor << separator;
-
-        return ss.str();
-    }
-
-    // Produce a stringified token of the test fft params.
-    std::string token() const
-    {
-        std::string ret;
-
-        switch(transform_type)
-        {
-        case fft_transform_type_complex_forward:
-            ret += "complex_forward_";
-            break;
-        case fft_transform_type_complex_inverse:
-            ret += "complex_inverse_";
-            break;
-        case fft_transform_type_real_forward:
-            ret += "real_forward_";
-            break;
-        case fft_transform_type_real_inverse:
-            ret += "real_inverse_";
-            break;
-        }
-
-        ret += "len_";
-
-        for(auto n : length)
-        {
-            ret += std::to_string(n);
-            ret += "_";
-        }
-        switch(precision)
-        {
-        case fft_precision_single:
-            ret += "single_";
-            break;
-        case fft_precision_double:
-            ret += "double_";
-            break;
-        }
-
-        switch(placement)
-        {
-        case fft_placement_inplace:
-            ret += "ip_";
-            break;
-        case fft_placement_notinplace:
-            ret += "op_";
-            break;
-        }
-
-        ret += "batch_";
-        ret += std::to_string(nbatch);
-
-        auto append_array_info = [&ret](const std::vector<size_t>& stride, fft_array_type type) {
-            for(auto s : stride)
-            {
-                ret += std::to_string(s);
-                ret += "_";
-            }
-
-            switch(type)
-            {
-            case fft_array_type_complex_interleaved:
-                ret += "CI";
-                break;
-            case fft_array_type_complex_planar:
-                ret += "CP";
-                break;
-            case fft_array_type_real:
-                ret += "R";
-                break;
-            case fft_array_type_hermitian_interleaved:
-                ret += "HI";
-                break;
-            case fft_array_type_hermitian_planar:
-                ret += "HP";
-                break;
-            default:
-                ret += "UN";
-                break;
-            }
-        };
-
-        ret += "_istride_";
-        append_array_info(istride, itype);
-
-        ret += "_ostride_";
-        append_array_info(ostride, otype);
-
-        ret += "_idist_";
-        ret += std::to_string(idist);
-        ret += "_odist_";
-        ret += std::to_string(odist);
-
-        ret += "_ioffset";
-        for(auto n : ioffset)
-        {
-            ret += "_";
-            ret += std::to_string(n);
-        }
-
-        ret += "_ooffset";
-        for(auto n : ooffset)
-        {
-            ret += "_";
-            ret += std::to_string(n);
-        }
-
-        if(run_callbacks)
-            ret += "_CB";
-
-        if(scale_factor != 1.0)
-            ret += "_scale";
-
-        return ret;
-    }
-
-    // Set all params from a stringified token.
-    void from_token(std::string token)
-    {
-        std::vector<std::string> vals;
-
-        std::string delimiter = "_";
-        {
-            size_t pos = 0;
-            while((pos = token.find(delimiter)) != std::string::npos)
-            {
-                auto val = token.substr(0, pos);
-                vals.push_back(val);
-                token.erase(0, pos + delimiter.length());
-            }
-            vals.push_back(token);
-        }
-
-        auto vector_parser
-            = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
-                  if(vals[pos++] != token)
-                      throw std::runtime_error("Unable to parse token");
-                  std::vector<size_t> vec;
-
-                  while(pos < vals.size())
-                  {
-                      if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit))
-                      {
-                          vec.push_back(std::stoull(vals[pos++]));
-                      }
-                      else
-                      {
-                          break;
-                      }
-                  }
-                  return vec;
-              };
-
-        auto type_parser = [](const std::string& val) {
-            if(val == "CI")
-                return fft_array_type_complex_interleaved;
-            else if(val == "CP")
-                return fft_array_type_complex_planar;
-            else if(val == "R")
-                return fft_array_type_real;
-            else if(val == "HI")
-                return fft_array_type_hermitian_interleaved;
-            else if(val == "HP")
-                return fft_array_type_hermitian_planar;
-            return fft_array_type_unset;
-        };
-
-        size_t pos = 0;
-
-        bool complex = vals[pos++] == "complex";
-        bool forward = vals[pos++] == "forward";
-
-        if(complex && forward)
-            transform_type = fft_transform_type_complex_forward;
-        if(complex && !forward)
-            transform_type = fft_transform_type_complex_inverse;
-        if(!complex && forward)
-            transform_type = fft_transform_type_real_forward;
-        if(!complex && !forward)
-            transform_type = fft_transform_type_real_inverse;
-
-        length = vector_parser(vals, "len", pos);
-
-        if(vals[pos] == "single")
-            precision = fft_precision_single;
-        else if(vals[pos] == "double")
-            precision = fft_precision_double;
-        pos++;
-
-        placement = (vals[pos++] == "ip") ? fft_placement_inplace : fft_placement_notinplace;
-
-        if(vals[pos++] != "batch")
-            throw std::runtime_error("Unable to parse token");
-        nbatch = std::stoull(vals[pos++]);
-
-        istride = vector_parser(vals, "istride", pos);
-
-        itype = type_parser(vals[pos]);
-        pos++;
-
-        ostride = vector_parser(vals, "ostride", pos);
-
-        otype = type_parser(vals[pos]);
-        pos++;
-
-        if(vals[pos++] != "idist")
-            throw std::runtime_error("Unable to parse token");
-        idist = std::stoull(vals[pos++]);
-
-        if(vals[pos++] != "odist")
-            throw std::runtime_error("Unable to parse token");
-        odist = std::stoull(vals[pos++]);
-
-        ioffset = vector_parser(vals, "ioffset", pos);
-
-        ooffset = vector_parser(vals, "ooffset", pos);
-
-        if(pos < vals.size() && vals[pos] == "CB")
-        {
-            run_callbacks = true;
-            ++pos;
-        }
-
-        if(pos < vals.size() && vals[pos] == "scale")
-        {
-            // just pick some factor that's not zero or one
-            scale_factor = 0.1239;
-            ++pos;
-        }
-    }
-
-    // Stream output operator (for gtest, etc).
-    friend std::ostream& operator<<(std::ostream& stream, const fft_params& params)
-    {
-        stream << params.str();
-        return stream;
-    }
-
-    // Dimension of the transform.
-    size_t dim() const
-    {
-        return length.size();
-    }
-
-    virtual std::vector<size_t> ilength() const
-    {
-        auto ilength = length;
-        if(transform_type == fft_transform_type_real_inverse)
-            ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1;
-        return ilength;
-    }
-
-    virtual std::vector<size_t> olength() const
-    {
-        auto olength = length;
-        if(transform_type == fft_transform_type_real_forward)
-            olength[dim() - 1] = olength[dim() - 1] / 2 + 1;
-        return olength;
-    }
-
-    static size_t nbuffer(const fft_array_type type)
-    {
-        switch(type)
-        {
-        case fft_array_type_real:
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-            return 1;
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-            return 2;
-        case fft_array_type_unset:
-            return 0;
-        }
-        return 0;
-    }
-
-    // Number of input buffers
-    size_t nibuffer() const
-    {
-        return nbuffer(itype);
-    }
-
-    // Number of output buffers
-    size_t nobuffer() const
-    {
-        return nbuffer(otype);
-    }
-
-    void set_iotypes()
-    {
-        if(itype == fft_array_type_unset)
-        {
-            switch(transform_type)
-            {
-            case fft_transform_type_complex_forward:
-            case fft_transform_type_complex_inverse:
-                itype = fft_array_type_complex_interleaved;
-                break;
-            case fft_transform_type_real_forward:
-                itype = fft_array_type_real;
-                break;
-            case fft_transform_type_real_inverse:
-                itype = fft_array_type_hermitian_interleaved;
-                break;
-            default:
-                throw std::runtime_error("Invalid transform type");
-            }
-        }
-        if(otype == fft_array_type_unset)
-        {
-            switch(transform_type)
-            {
-            case fft_transform_type_complex_forward:
-            case fft_transform_type_complex_inverse:
-                otype = fft_array_type_complex_interleaved;
-                break;
-            case fft_transform_type_real_forward:
-                otype = fft_array_type_hermitian_interleaved;
-                break;
-            case fft_transform_type_real_inverse:
-                otype = fft_array_type_real;
-                break;
-            default:
-                throw std::runtime_error("Invalid transform type");
-            }
-        }
-    }
-
-    // Check that the input and output types are consistent.
-    bool check_iotypes() const
-    {
-        switch(itype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_interleaved:
-        case fft_array_type_hermitian_planar:
-        case fft_array_type_real:
-            break;
-        default:
-            throw std::runtime_error("Invalid Input array type format");
-        }
-
-        switch(otype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_interleaved:
-        case fft_array_type_hermitian_planar:
-        case fft_array_type_real:
-            break;
-        default:
-            throw std::runtime_error("Invalid Input array type format");
-        }
-
-        // Check that format choices are supported
-        if(transform_type != fft_transform_type_real_forward
-           && transform_type != fft_transform_type_real_inverse)
-        {
-            if(placement == fft_placement_inplace && itype != otype)
-            {
-                throw std::runtime_error(
-                    "In-place transforms must have identical input and output types");
-            }
-        }
-
-        bool okformat = true;
-        switch(itype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_complex_planar:
-            okformat = (otype == fft_array_type_complex_interleaved
-                        || otype == fft_array_type_complex_planar);
-            break;
-        case fft_array_type_hermitian_interleaved:
-        case fft_array_type_hermitian_planar:
-            okformat = otype == fft_array_type_real;
-            break;
-        case fft_array_type_real:
-            okformat = (otype == fft_array_type_hermitian_interleaved
-                        || otype == fft_array_type_hermitian_planar);
-            break;
-        default:
-            throw std::runtime_error("Invalid Input array type format");
-        }
-
-        return okformat;
-    }
-
-    // Given a length vector, set the rest of the strides.
-    // The optional argument stride0 sets the stride for the contiguous dimension.
-    // The optional rcpadding argument sets the stride correctly for in-place
-    // multi-dimensional real/complex transforms.
-    // Format is row-major.
-    template <typename T1>
-    std::vector<T1> compute_stride(const std::vector<T1>&     length,
-                                   const std::vector<size_t>& stride0   = std::vector<size_t>(),
-                                   const bool                 rcpadding = false) const
-    {
-        std::vector<T1> stride(dim());
-
-        size_t dimoffset = 0;
-
-        if(stride0.size() == 0)
-        {
-            // Set the contiguous stride:
-            stride[dim() - 1] = 1;
-            dimoffset         = 1;
-        }
-        else
-        {
-            // Copy the input values to the end of the stride array:
-            for(size_t i = 0; i < stride0.size(); ++i)
-            {
-                stride[dim() - stride0.size() + i] = stride0[i];
-            }
-        }
-
-        if(stride0.size() < dim())
-        {
-            // Compute any remaining values via recursion.
-            for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;)
-            {
-                auto lengthip1 = length[i + 1];
-                if(rcpadding && i == dim() - 2)
-                {
-                    lengthip1 = 2 * (lengthip1 / 2 + 1);
-                }
-                stride[i] = stride[i + 1] * lengthip1;
-            }
-        }
-
-        return stride;
-    }
-
-    void compute_istride()
-    {
-        istride = compute_stride(ilength(),
-                                 istride,
-                                 placement == fft_placement_inplace
-                                     && transform_type == fft_transform_type_real_forward);
-    }
-
-    void compute_ostride()
-    {
-        ostride = compute_stride(olength(),
-                                 ostride,
-                                 placement == fft_placement_inplace
-                                     && transform_type == fft_transform_type_real_inverse);
-    }
-
-    virtual void compute_isize()
-    {
-        auto   il  = ilength();
-        size_t val = compute_ptrdiff(il, istride, nbatch, idist);
-        isize.resize(nibuffer());
-        for(unsigned int i = 0; i < isize.size(); ++i)
-        {
-            isize[i] = val + ioffset[i];
-        }
-    }
-
-    virtual void compute_osize()
-    {
-        auto   ol  = olength();
-        size_t val = compute_ptrdiff(ol, ostride, nbatch, odist);
-        osize.resize(nobuffer());
-        for(unsigned int i = 0; i < osize.size(); ++i)
-        {
-            osize[i] = val + ooffset[i];
-        }
-    }
-
-    std::vector<size_t> ibuffer_sizes() const
-    {
-        std::vector<size_t> ibuffer_sizes;
-
-        // In-place real-to-complex transforms need to have enough space in the input buffer to
-        // accomadate the output, which is slightly larger.
-        if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward)
-        {
-            return obuffer_sizes();
-        }
-
-        if(isize.empty())
-            return ibuffer_sizes;
-
-        switch(itype)
-        {
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-            ibuffer_sizes.resize(2);
-            break;
-        default:
-            ibuffer_sizes.resize(1);
-        }
-        for(unsigned i = 0; i < ibuffer_sizes.size(); i++)
-        {
-            ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype);
-        }
-        return ibuffer_sizes;
-    }
-
-    virtual std::vector<size_t> obuffer_sizes() const
-    {
-        std::vector<size_t> obuffer_sizes;
-
-        if(osize.empty())
-            return obuffer_sizes;
-
-        switch(otype)
-        {
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-            obuffer_sizes.resize(2);
-            break;
-        default:
-            obuffer_sizes.resize(1);
-        }
-        for(unsigned i = 0; i < obuffer_sizes.size(); i++)
-        {
-            obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype);
-        }
-        return obuffer_sizes;
-    }
-
-    // Compute the idist for a given transform based on the placeness, transform type, and data
-    // layout.
-    void set_idist()
-    {
-        if(idist != 0)
-            return;
-
-        // In-place 1D transforms need extra dist.
-        if(transform_type == fft_transform_type_real_forward && dim() == 1
-           && placement == fft_placement_inplace)
-        {
-            idist = 2 * (length[0] / 2 + 1) * istride[0];
-            return;
-        }
-
-        if(transform_type == fft_transform_type_real_inverse && dim() == 1)
-        {
-            idist = (length[0] / 2 + 1) * istride[0];
-            return;
-        }
-
-        idist = (transform_type == fft_transform_type_real_inverse)
-                    ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1]
-                    : length[dim() - 1] * istride[dim() - 1];
-        for(unsigned int i = 0; i < dim() - 1; ++i)
-        {
-            idist = std::max(length[i] * istride[i], idist);
-        }
-    }
-
-    // Compute the odist for a given transform based on the placeness, transform type, and data
-    // layout.  Row-major.
-    void set_odist()
-    {
-        if(odist != 0)
-            return;
-
-        // In-place 1D transforms need extra dist.
-        if(transform_type == fft_transform_type_real_inverse && dim() == 1
-           && placement == fft_placement_inplace)
-        {
-            odist = 2 * (length[0] / 2 + 1) * ostride[0];
-            return;
-        }
-
-        if(transform_type == fft_transform_type_real_forward && dim() == 1)
-        {
-            odist = (length[0] / 2 + 1) * ostride[0];
-            return;
-        }
-
-        odist = (transform_type == fft_transform_type_real_forward)
-                    ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
-                    : length[dim() - 1] * ostride[dim() - 1];
-        for(unsigned int i = 0; i < dim() - 1; ++i)
-        {
-            odist = std::max(length[i] * ostride[i], odist);
-        }
-    }
-
-    // Put the length, stride, batch, and dist into a single length/stride array and pass off to the
-    // validity checker.
-    bool valid_length_stride_batch_dist(const std::vector<size_t>& l0,
-                                        const std::vector<size_t>& s0,
-                                        const size_t               n,
-                                        const size_t               dist,
-                                        const int                  verbose = 0) const
-    {
-        if(l0.size() != s0.size())
-            return false;
-
-        // Length and stride vectors, including bathes:
-        std::vector<size_t> l{}, s{};
-        for(unsigned int i = 0; i < l0.size(); ++i)
-        {
-            if(l0[i] > 1)
-            {
-                if(s0[i] == 0)
-                    return false;
-                l.push_back(l0[i]);
-                s.push_back(s0[i]);
-            }
-        }
-        if(n > 1)
-        {
-            if(dist == 0)
-                return false;
-            l.push_back(n);
-            s.push_back(dist);
-        }
-
-        return array_valid(l, s, verbose);
-    }
-
-    // Return true if the given GPU parameters would produce a valid transform.
-    bool valid(const int verbose) const
-    {
-        if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer())
-            return false;
-
-        // Check that in-place transforms have the same input and output stride:
-        if(placement == fft_placement_inplace)
-        {
-            const auto stridesize = std::min(istride.size(), ostride.size());
-            bool       samestride = true;
-            for(unsigned int i = 0; i < stridesize; ++i)
-            {
-                if(istride[i] != ostride[i])
-                    samestride = false;
-            }
-            if((transform_type == fft_transform_type_complex_forward
-                || transform_type == fft_transform_type_complex_inverse)
-               && !samestride)
-            {
-                // In-place transforms require identical input and output strides.
-                if(verbose)
-                {
-                    std::cout << "istride:";
-                    for(const auto& i : istride)
-                        std::cout << " " << i;
-                    std::cout << " ostride0:";
-                    for(const auto& i : ostride)
-                        std::cout << " " << i;
-                    std::cout << " differ; skipped for in-place transforms: skipping test"
-                              << std::endl;
-                }
-                return false;
-            }
-
-            if((transform_type == fft_transform_type_complex_forward
-                || transform_type == fft_transform_type_complex_inverse)
-               && (idist != odist))
-            {
-                // In-place transforms require identical distance
-                if(verbose)
-                {
-                    std::cout << "idist:" << idist << " odist:" << odist
-                              << " differ; skipped for in-place transforms: skipping test"
-                              << std::endl;
-                }
-                return false;
-            }
-
-            if((transform_type == fft_transform_type_real_forward
-                || transform_type == fft_transform_type_real_inverse)
-               && (istride.back() != 1 || ostride.back() != 1))
-            {
-                // In-place real/complex transforms require unit strides.
-                if(verbose)
-                {
-                    std::cout
-                        << "istride.back(): " << istride.back()
-                        << " ostride.back(): " << ostride.back()
-                        << " must be unitary for in-place real/complex transforms: skipping test"
-                        << std::endl;
-                }
-                return false;
-            }
-
-            if((itype == fft_array_type_complex_interleaved
-                && otype == fft_array_type_complex_planar)
-               || (itype == fft_array_type_complex_planar
-                   && otype == fft_array_type_complex_interleaved))
-            {
-                if(verbose)
-                {
-                    std::cout << "In-place c2c transforms require identical io types; skipped.\n";
-                }
-                return false;
-            }
-
-            // Check offsets
-            switch(transform_type)
-            {
-            case fft_transform_type_complex_forward:
-            case fft_transform_type_complex_inverse:
-                for(unsigned int i = 0; i < nibuffer(); ++i)
-                {
-                    if(ioffset[i] != ooffset[i])
-                        return false;
-                }
-                break;
-            case fft_transform_type_real_forward:
-                if(ioffset[0] != 2 * ooffset[0])
-                    return false;
-                break;
-            case fft_transform_type_real_inverse:
-                if(2 * ioffset[0] != ooffset[0])
-                    return false;
-                break;
-            }
-        }
-
-        if(!check_iotypes())
-            return false;
-
-        // we can only check output strides on out-of-place
-        // transforms, since we need to initialize output to a known
-        // pattern
-        if(placement == fft_placement_inplace && check_output_strides)
-            return false;
-
-        // Check input and output strides
-        if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true)
-        {
-            if(verbose)
-                std::cout << "Invalid input data format.\n";
-            return false;
-        }
-        if(!(ilength() == olength() && istride == ostride && idist == odist))
-        {
-            // Only check if different
-            if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true)
-            {
-                if(verbose)
-                    std::cout << "Invalid output data format.\n";
-                return false;
-            }
-        }
-
-        // The parameters are valid.
-        return true;
-    }
-
-    // Fill in any missing parameters.
-    void validate()
-    {
-        set_iotypes();
-        compute_istride();
-        compute_ostride();
-        set_idist();
-        set_odist();
-        compute_isize();
-        compute_osize();
-    }
-
-    // Column-major getters:
-    std::vector<size_t> length_cm() const
-    {
-        auto length_cm = length;
-        std::reverse(std::begin(length_cm), std::end(length_cm));
-        return length_cm;
-    }
-    std::vector<size_t> ilength_cm() const
-    {
-        auto ilength_cm = ilength();
-        std::reverse(std::begin(ilength_cm), std::end(ilength_cm));
-        return ilength_cm;
-    }
-    std::vector<size_t> olength_cm() const
-    {
-        auto olength_cm = olength();
-        std::reverse(std::begin(olength_cm), std::end(olength_cm));
-        return olength_cm;
-    }
-    std::vector<size_t> istride_cm() const
-    {
-        auto istride_cm = istride;
-        std::reverse(std::begin(istride_cm), std::end(istride_cm));
-        return istride_cm;
-    }
-    std::vector<size_t> ostride_cm() const
-    {
-        auto ostride_cm = ostride;
-        std::reverse(std::begin(ostride_cm), std::end(ostride_cm));
-        return ostride_cm;
-    }
-
-    template <typename Tallocator, typename Tstream = std::ostream>
-    void print_ibuffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                       Tstream&                                          stream = std::cout) const
-    {
-        switch(itype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<std::complex<float>> s;
-                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
-                break;
-            }
-            case fft_precision_double:
-            {
-                buffer_printer<std::complex<double>> s;
-                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
-                break;
-            }
-            }
-            break;
-        }
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-        case fft_array_type_real:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<float> s;
-                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
-                break;
-            }
-            case fft_precision_double:
-            {
-                buffer_printer<double> s;
-                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
-                break;
-            }
-            }
-            break;
-        }
-        default:
-            throw std::runtime_error("Invalid itype in print_ibuffer");
-        }
-    }
-
-    template <typename Tallocator, typename Tstream = std::ostream>
-    void print_obuffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                       Tstream&                                          stream = std::cout) const
-    {
-        switch(otype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<std::complex<float>> s;
-                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
-                break;
-            }
-            case fft_precision_double:
-                buffer_printer<std::complex<double>> s;
-                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
-                break;
-            }
-            break;
-        }
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-        case fft_array_type_real:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<float> s;
-                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
-                break;
-            }
-            case fft_precision_double:
-            {
-                buffer_printer<double> s;
-                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
-                break;
-            }
-            }
-            break;
-        }
-
-        default:
-            throw std::runtime_error("Invalid itype in print_obuffer");
-        }
-    }
-
-    template <typename Tallocator>
-    void print_ibuffer_flat(const std::vector<std::vector<char, Tallocator>>& buf) const
-    {
-        switch(itype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<std::complex<float>> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            case fft_precision_double:
-                buffer_printer<std::complex<double>> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            break;
-        }
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-        case fft_array_type_real:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<float> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            case fft_precision_double:
-            {
-                buffer_printer<double> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            }
-            break;
-        default:
-            throw std::runtime_error("Invalid itype in print_ibuffer_flat");
-        }
-        }
-    }
-
-    template <typename Tallocator>
-    void print_obuffer_flat(const std::vector<std::vector<char, Tallocator>>& buf) const
-    {
-        switch(otype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<std::complex<float>> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            case fft_precision_double:
-                buffer_printer<std::complex<double>> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            break;
-        }
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-        case fft_array_type_real:
-        {
-            switch(precision)
-            {
-            case fft_precision_single:
-            {
-                buffer_printer<float> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-
-            case fft_precision_double:
-            {
-                buffer_printer<double> s;
-                s.print_buffer_flat(buf, osize, ooffset);
-                break;
-            }
-            }
-            break;
-        default:
-            throw std::runtime_error("Invalid itype in print_ibuffer_flat");
-        }
-        }
-    }
-
-    virtual fft_status set_callbacks(void* load_cb_host,
-                                     void* load_cb_data,
-                                     void* store_cb_host,
-                                     void* store_cb_data)
-    {
-        return fft_status_success;
-    }
-
-    virtual fft_status execute(void** in, void** out)
-    {
-        return fft_status_success;
-    };
-
-    size_t fft_params_vram_footprint()
-    {
-        return fft_params::vram_footprint();
-    }
-
-    virtual size_t vram_footprint()
-    {
-        const auto ibuf_size = ibuffer_sizes();
-        size_t     val       = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1);
-        if(placement == fft_placement_notinplace)
-        {
-            const auto obuf_size = obuffer_sizes();
-            val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1);
-        }
-        return val;
-    }
-
-    // Specific exception type for work buffer allocation failure.
-    // Tests that hit this can't fit on the GPU and should be skipped.
-    struct work_buffer_alloc_failure : public std::runtime_error
-    {
-        work_buffer_alloc_failure(const std::string& s)
-            : std::runtime_error(s)
-        {
-        }
-    };
-
-    virtual fft_status create_plan()
-    {
-        return fft_status_success;
-    }
-};
-
-// This is used with the program_options class so that the user can type an integer on the
-// command line and we store into an enum varaible
-template <typename _Elem, typename _Traits>
-std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
-                                               fft_array_type&                     atype)
-{
-    unsigned tmp;
-    stream >> tmp;
-    atype = fft_array_type(tmp);
-    return stream;
-}
-
-// similarly for transform type
-template <typename _Elem, typename _Traits>
-std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
-                                               fft_transform_type&                 ttype)
-{
-    unsigned tmp;
-    stream >> tmp;
-    ttype = fft_transform_type(tmp);
-    return stream;
-}
-
-// count the number of total iterations for 1-, 2-, and 3-D dimensions
-template <typename T1>
-size_t count_iters(const T1& i)
-{
-    return i;
-}
-
-template <typename T1>
-size_t count_iters(const std::tuple<T1, T1>& i)
-{
-    return std::get<0>(i) * std::get<1>(i);
-}
-
-template <typename T1>
-size_t count_iters(const std::tuple<T1, T1, T1>& i)
-{
-    return std::get<0>(i) * std::get<1>(i) * std::get<2>(i);
-}
-
-// Work out how many partitions to break our iteration problem into
-template <typename T1>
-static size_t compute_partition_count(T1 length)
-{
-#ifdef BUILD_CLIENTS_TESTS_OPENMP
-    // we seem to get contention from too many threads, which slows
-    // things down.  particularly noticeable with mix_3D tests
-    static const size_t MAX_PARTITIONS = 8;
-    size_t              iters          = count_iters(length);
-    size_t hw_threads = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs()));
-    if(!hw_threads)
-        return 1;
-
-    // don't bother threading problem sizes that are too small. pick
-    // an arbitrary number of iterations and ensure that each thread
-    // has at least that many iterations to process
-    static const size_t MIN_ITERS_PER_THREAD = 2048;
-
-    // either use the whole CPU, or use ceil(iters/iters_per_thread)
-    return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD);
-#else
-    return 1;
-#endif
-}
-
-// Break a scalar length into some number of pieces, returning
-// [(start0, end0), (start1, end1), ...]
-template <typename T1>
-std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts)
-{
-    static_assert(std::is_integral<T1>::value, "Integral required.");
-
-    // make sure we don't exceed the length
-    num_parts = std::min(length, num_parts);
-
-    std::vector<std::pair<T1, T1>> ret(num_parts);
-    auto                           partition_size = length / num_parts;
-    T1                             cur_partition  = 0;
-    for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size)
-    {
-        ret[i].first  = cur_partition;
-        ret[i].second = cur_partition + partition_size;
-    }
-    // last partition might not divide evenly, fix it up
-    ret.back().second = length;
-    return ret;
-}
-
-// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
-template <typename T1>
-std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length)
-{
-    return partition_base(length, compute_partition_count(length));
-}
-
-// Partition on the leftmost part of the tuple, for row-major indexing
-template <typename T1>
-std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
-    partition_rowmajor(const std::tuple<T1, T1>& length)
-{
-    auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
-    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
-    for(size_t i = 0; i < partitions.size(); ++i)
-    {
-        std::get<0>(ret[i].first)  = partitions[i].first;
-        std::get<1>(ret[i].first)  = 0;
-        std::get<0>(ret[i].second) = partitions[i].second;
-        std::get<1>(ret[i].second) = std::get<1>(length);
-    }
-    return ret;
-}
-template <typename T1>
-std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
-    partition_rowmajor(const std::tuple<T1, T1, T1>& length)
-{
-    auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
-    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
-    for(size_t i = 0; i < partitions.size(); ++i)
-    {
-        std::get<0>(ret[i].first)  = partitions[i].first;
-        std::get<1>(ret[i].first)  = 0;
-        std::get<2>(ret[i].first)  = 0;
-        std::get<0>(ret[i].second) = partitions[i].second;
-        std::get<1>(ret[i].second) = std::get<1>(length);
-        std::get<2>(ret[i].second) = std::get<2>(length);
-    }
-    return ret;
-}
-
-// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
-template <typename T1>
-std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length)
-{
-    return partition_base(length, compute_partition_count(length));
-}
-
-// Partition on the rightmost part of the tuple, for col-major indexing
-template <typename T1>
-std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
-    partition_colmajor(const std::tuple<T1, T1>& length)
-{
-    auto partitions = partition_base(std::get<1>(length), compute_partition_count(length));
-    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
-    for(size_t i = 0; i < partitions.size(); ++i)
-    {
-        std::get<1>(ret[i].first)  = partitions[i].first;
-        std::get<0>(ret[i].first)  = 0;
-        std::get<1>(ret[i].second) = partitions[i].second;
-        std::get<0>(ret[i].second) = std::get<0>(length);
-    }
-    return ret;
-}
-template <typename T1>
-std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
-    partition_colmajor(const std::tuple<T1, T1, T1>& length)
-{
-    auto partitions = partition_base(std::get<2>(length), compute_partition_count(length));
-    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
-    for(size_t i = 0; i < partitions.size(); ++i)
-    {
-        std::get<2>(ret[i].first)  = partitions[i].first;
-        std::get<1>(ret[i].first)  = 0;
-        std::get<0>(ret[i].first)  = 0;
-        std::get<2>(ret[i].second) = partitions[i].second;
-        std::get<1>(ret[i].second) = std::get<1>(length);
-        std::get<0>(ret[i].second) = std::get<0>(length);
-    }
-    return ret;
-}
-
-// Specialized computation of index given 1-, 2-, 3- dimension length + stride
-template <typename T1, typename T2>
-size_t compute_index(T1 length, T2 stride, size_t base)
-{
-    return (length * stride) + base;
-}
-
-template <typename T1, typename T2>
-size_t
-    compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base)
-{
-    static_assert(std::is_integral<T1>::value, "Integral required.");
-    static_assert(std::is_integral<T2>::value, "Integral required.");
-    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
-           + base;
-}
-
-template <typename T1, typename T2>
-size_t compute_index(const std::tuple<T1, T1, T1>& length,
-                     const std::tuple<T2, T2, T2>& stride,
-                     size_t                        base)
-{
-    static_assert(std::is_integral<T1>::value, "Integral required.");
-    static_assert(std::is_integral<T2>::value, "Integral required.");
-    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
-           + (std::get<2>(length) * std::get<2>(stride)) + base;
-}
-
-// Copy data of dimensions length with strides istride and length idist between batches to
-// a buffer with strides ostride and length odist between batches.  The input and output
-// types are identical.
-template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
-inline void copy_buffers_1to1(const Tval*                input,
-                              Tval*                      output,
-                              const Tint1&               whole_length,
-                              const size_t               nbatch,
-                              const Tint2&               istride,
-                              const size_t               idist,
-                              const Tint3&               ostride,
-                              const size_t               odist,
-                              const std::vector<size_t>& ioffset,
-                              const std::vector<size_t>& ooffset)
-{
-    const bool idx_equals_odx = istride == ostride && idist == odist;
-    size_t     idx_base       = 0;
-    size_t     odx_base       = 0;
-    auto       partitions     = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
-    {
-#pragma omp parallel for num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            auto       index  = partitions[part].first;
-            const auto length = partitions[part].second;
-            do
-            {
-                const auto idx = compute_index(index, istride, idx_base);
-                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
-                output[odx + ooffset[0]] = input[idx + ioffset[0]];
-            } while(increment_rowmajor(index, length));
-        }
-    }
-}
-
-// Copy data of dimensions length with strides istride and length idist between batches to
-// a buffer with strides ostride and length odist between batches.  The input type is
-// planar and the output type is complex interleaved.
-template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
-inline void copy_buffers_2to1(const Tval*                input0,
-                              const Tval*                input1,
-                              std::complex<Tval>*        output,
-                              const Tint1&               whole_length,
-                              const size_t               nbatch,
-                              const Tint2&               istride,
-                              const size_t               idist,
-                              const Tint3&               ostride,
-                              const size_t               odist,
-                              const std::vector<size_t>& ioffset,
-                              const std::vector<size_t>& ooffset)
-{
-    const bool idx_equals_odx = istride == ostride && idist == odist;
-    size_t     idx_base       = 0;
-    size_t     odx_base       = 0;
-    auto       partitions     = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
-    {
-#pragma omp parallel for num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            auto       index  = partitions[part].first;
-            const auto length = partitions[part].second;
-            do
-            {
-                const auto idx = compute_index(index, istride, idx_base);
-                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
-                output[odx + ooffset[0]]
-                    = std::complex<Tval>(input0[idx + ioffset[0]], input1[idx + ioffset[1]]);
-            } while(increment_rowmajor(index, length));
-        }
-    }
-}
-
-// Copy data of dimensions length with strides istride and length idist between batches to
-// a buffer with strides ostride and length odist between batches.  The input type is
-// complex interleaved and the output type is planar.
-template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
-inline void copy_buffers_1to2(const std::complex<Tval>*  input,
-                              Tval*                      output0,
-                              Tval*                      output1,
-                              const Tint1&               whole_length,
-                              const size_t               nbatch,
-                              const Tint2&               istride,
-                              const size_t               idist,
-                              const Tint3&               ostride,
-                              const size_t               odist,
-                              const std::vector<size_t>& ioffset,
-                              const std::vector<size_t>& ooffset)
-{
-    const bool idx_equals_odx = istride == ostride && idist == odist;
-    size_t     idx_base       = 0;
-    size_t     odx_base       = 0;
-    auto       partitions     = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
-    {
-#pragma omp parallel for num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            auto       index  = partitions[part].first;
-            const auto length = partitions[part].second;
-            do
-            {
-                const auto idx = compute_index(index, istride, idx_base);
-                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
-                output0[odx + ooffset[0]] = input[idx + ioffset[0]].real();
-                output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag();
-            } while(increment_rowmajor(index, length));
-        }
-    }
-}
-
-// Copy data of dimensions length with strides istride and length idist between batches to
-// a buffer with strides ostride and length odist between batches.  The input type given
-// by itype, and the output type is given by otype.
-template <typename Tallocator1,
-          typename Tallocator2,
-          typename Tint1,
-          typename Tint2,
-          typename Tint3>
-inline void copy_buffers(const std::vector<std::vector<char, Tallocator1>>& input,
-                         std::vector<std::vector<char, Tallocator2>>&       output,
-                         const Tint1&                                       length,
-                         const size_t                                       nbatch,
-                         const fft_precision                                precision,
-                         const fft_array_type                               itype,
-                         const Tint2&                                       istride,
-                         const size_t                                       idist,
-                         const fft_array_type                               otype,
-                         const Tint3&                                       ostride,
-                         const size_t                                       odist,
-                         const std::vector<size_t>&                         ioffset,
-                         const std::vector<size_t>&                         ooffset)
-{
-    if(itype == otype)
-    {
-        switch(itype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-            switch(precision)
-            {
-            case fft_precision_single:
-                copy_buffers_1to1(reinterpret_cast<const std::complex<float>*>(input[0].data()),
-                                  reinterpret_cast<std::complex<float>*>(output[0].data()),
-                                  length,
-                                  nbatch,
-                                  istride,
-                                  idist,
-                                  ostride,
-                                  odist,
-                                  ioffset,
-                                  ooffset);
-                break;
-            case fft_precision_double:
-                copy_buffers_1to1(reinterpret_cast<const std::complex<double>*>(input[0].data()),
-                                  reinterpret_cast<std::complex<double>*>(output[0].data()),
-                                  length,
-                                  nbatch,
-                                  istride,
-                                  idist,
-                                  ostride,
-                                  odist,
-                                  ioffset,
-                                  ooffset);
-                break;
-            }
-            break;
-        case fft_array_type_real:
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-            for(unsigned int idx = 0; idx < input.size(); ++idx)
-            {
-                switch(precision)
-                {
-                case fft_precision_single:
-                    copy_buffers_1to1(reinterpret_cast<const float*>(input[idx].data()),
-                                      reinterpret_cast<float*>(output[idx].data()),
-                                      length,
-                                      nbatch,
-                                      istride,
-                                      idist,
-                                      ostride,
-                                      odist,
-                                      ioffset,
-                                      ooffset);
-                    break;
-                case fft_precision_double:
-                    copy_buffers_1to1(reinterpret_cast<const double*>(input[idx].data()),
-                                      reinterpret_cast<double*>(output[idx].data()),
-                                      length,
-                                      nbatch,
-                                      istride,
-                                      idist,
-                                      ostride,
-                                      odist,
-                                      ioffset,
-                                      ooffset);
-                    break;
-                }
-            }
-            break;
-        default:
-            throw std::runtime_error("Invalid data type");
-        }
-    }
-    else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
-            || (itype == fft_array_type_hermitian_interleaved
-                && otype == fft_array_type_hermitian_planar))
-    {
-        // copy 1to2
-        switch(precision)
-        {
-        case fft_precision_single:
-            copy_buffers_1to2(reinterpret_cast<const std::complex<float>*>(input[0].data()),
-                              reinterpret_cast<float*>(output[0].data()),
-                              reinterpret_cast<float*>(output[1].data()),
-                              length,
-                              nbatch,
-                              istride,
-                              idist,
-                              ostride,
-                              odist,
-                              ioffset,
-                              ooffset);
-            break;
-        case fft_precision_double:
-            copy_buffers_1to2(reinterpret_cast<const std::complex<double>*>(input[0].data()),
-                              reinterpret_cast<double*>(output[0].data()),
-                              reinterpret_cast<double*>(output[1].data()),
-                              length,
-                              nbatch,
-                              istride,
-                              idist,
-                              ostride,
-                              odist,
-                              ioffset,
-                              ooffset);
-            break;
-        }
-    }
-    else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
-            || (itype == fft_array_type_hermitian_planar
-                && otype == fft_array_type_hermitian_interleaved))
-    {
-        // copy 2 to 1
-        switch(precision)
-        {
-        case fft_precision_single:
-            copy_buffers_2to1(reinterpret_cast<const float*>(input[0].data()),
-                              reinterpret_cast<const float*>(input[1].data()),
-                              reinterpret_cast<std::complex<float>*>(output[0].data()),
-                              length,
-                              nbatch,
-                              istride,
-                              idist,
-                              ostride,
-                              odist,
-                              ioffset,
-                              ooffset);
-            break;
-        case fft_precision_double:
-            copy_buffers_2to1(reinterpret_cast<const double*>(input[0].data()),
-                              reinterpret_cast<const double*>(input[1].data()),
-                              reinterpret_cast<std::complex<double>*>(output[0].data()),
-                              length,
-                              nbatch,
-                              istride,
-                              idist,
-                              ostride,
-                              odist,
-                              ioffset,
-                              ooffset);
-            break;
-        }
-    }
-    else
-    {
-        throw std::runtime_error("Invalid input and output types.");
-    }
-}
-
-// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions
-template <typename Tallocator1,
-          typename Tallocator2,
-          typename Tint1,
-          typename Tint2,
-          typename Tint3>
-inline void copy_buffers(const std::vector<std::vector<char, Tallocator1>>& input,
-                         std::vector<std::vector<char, Tallocator2>>&       output,
-                         const std::vector<Tint1>&                          length,
-                         const size_t                                       nbatch,
-                         const fft_precision                                precision,
-                         const fft_array_type                               itype,
-                         const std::vector<Tint2>&                          istride,
-                         const size_t                                       idist,
-                         const fft_array_type                               otype,
-                         const std::vector<Tint3>&                          ostride,
-                         const size_t                                       odist,
-                         const std::vector<size_t>&                         ioffset,
-                         const std::vector<size_t>&                         ooffset)
-{
-    switch(length.size())
-    {
-    case 1:
-        return copy_buffers(input,
-                            output,
-                            length[0],
-                            nbatch,
-                            precision,
-                            itype,
-                            istride[0],
-                            idist,
-                            otype,
-                            ostride[0],
-                            odist,
-                            ioffset,
-                            ooffset);
-    case 2:
-        return copy_buffers(input,
-                            output,
-                            std::make_tuple(length[0], length[1]),
-                            nbatch,
-                            precision,
-                            itype,
-                            std::make_tuple(istride[0], istride[1]),
-                            idist,
-                            otype,
-                            std::make_tuple(ostride[0], ostride[1]),
-                            odist,
-                            ioffset,
-                            ooffset);
-    case 3:
-        return copy_buffers(input,
-                            output,
-                            std::make_tuple(length[0], length[1], length[2]),
-                            nbatch,
-                            precision,
-                            itype,
-                            std::make_tuple(istride[0], istride[1], istride[2]),
-                            idist,
-                            otype,
-                            std::make_tuple(ostride[0], ostride[1], ostride[2]),
-                            odist,
-                            ioffset,
-                            ooffset);
-    default:
-        abort();
-    }
-}
-
-// Compute the L-infinity and L-2 distance between two buffers with strides istride and
-// length idist between batches to a buffer with strides ostride and length odist between
-// batches.  Both buffers are of complex type.
-
-struct VectorNorms
-{
-    double l_2 = 0.0, l_inf = 0.0;
-};
-
-template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
-inline VectorNorms distance_1to1_complex(const Tcomplex*                         input,
-                                         const Tcomplex*                         output,
-                                         const Tint1&                            whole_length,
-                                         const size_t                            nbatch,
-                                         const Tint2&                            istride,
-                                         const size_t                            idist,
-                                         const Tint3&                            ostride,
-                                         const size_t                            odist,
-                                         std::vector<std::pair<size_t, size_t>>& linf_failures,
-                                         const double                            linf_cutoff,
-                                         const std::vector<size_t>&              ioffset,
-                                         const std::vector<size_t>&              ooffset)
-{
-    double linf = 0.0;
-    double l2   = 0.0;
-
-    std::mutex linf_failure_lock;
-
-    const bool idx_equals_odx = istride == ostride && idist == odist;
-    size_t     idx_base       = 0;
-    size_t     odx_base       = 0;
-    auto       partitions     = partition_colmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
-    {
-#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            double     cur_linf = 0.0;
-            double     cur_l2   = 0.0;
-            auto       index    = partitions[part].first;
-            const auto length   = partitions[part].second;
-
-            do
-            {
-                const auto   idx = compute_index(index, istride, idx_base);
-                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
-                const double rdiff
-                    = std::abs(output[odx + ooffset[0]].real() - input[idx + ioffset[0]].real());
-                cur_linf = std::max(rdiff, cur_linf);
-                if(cur_linf > linf_cutoff)
-                {
-                    std::pair<size_t, size_t> fval(b, idx);
-                    linf_failure_lock.lock();
-                    linf_failures.push_back(fval);
-                    linf_failure_lock.unlock();
-                }
-                cur_l2 += rdiff * rdiff;
-
-                const double idiff
-                    = std::abs(output[odx + ooffset[0]].imag() - input[idx + ioffset[0]].imag());
-                cur_linf = std::max(idiff, cur_linf);
-                if(cur_linf > linf_cutoff)
-                {
-                    std::pair<size_t, size_t> fval(b, idx);
-                    linf_failure_lock.lock();
-                    linf_failures.push_back(fval);
-                    linf_failure_lock.unlock();
-                }
-                cur_l2 += idiff * idiff;
-
-            } while(increment_rowmajor(index, length));
-            linf = std::max(linf, cur_linf);
-            l2 += cur_l2;
-        }
-    }
-    return {.l_2 = sqrt(l2), .l_inf = linf};
-}
-
-// Compute the L-infinity and L-2 distance between two buffers with strides istride and
-// length idist between batches to a buffer with strides ostride and length odist between
-// batches.  Both buffers are of real type.
-template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3>
-inline VectorNorms distance_1to1_real(const Tfloat*                           input,
-                                      const Tfloat*                           output,
-                                      const Tint1&                            whole_length,
-                                      const size_t                            nbatch,
-                                      const Tint2&                            istride,
-                                      const size_t                            idist,
-                                      const Tint3&                            ostride,
-                                      const size_t                            odist,
-                                      std::vector<std::pair<size_t, size_t>>& linf_failures,
-                                      const double                            linf_cutoff,
-                                      const std::vector<size_t>&              ioffset,
-                                      const std::vector<size_t>&              ooffset)
-{
-    double linf = 0.0;
-    double l2   = 0.0;
-
-    std::mutex linf_failure_lock;
-
-    const bool idx_equals_odx = istride == ostride && idist == odist;
-    size_t     idx_base       = 0;
-    size_t     odx_base       = 0;
-    auto       partitions     = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
-    {
-#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            double     cur_linf = 0.0;
-            double     cur_l2   = 0.0;
-            auto       index    = partitions[part].first;
-            const auto length   = partitions[part].second;
-            do
-            {
-                const auto   idx  = compute_index(index, istride, idx_base);
-                const auto   odx  = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
-                const double diff = std::abs(output[odx + ooffset[0]] - input[idx + ioffset[0]]);
-                cur_linf          = std::max(diff, cur_linf);
-                if(cur_linf > linf_cutoff)
-                {
-                    std::pair<size_t, size_t> fval(b, idx);
-                    linf_failure_lock.lock();
-                    linf_failures.push_back(fval);
-                    linf_failure_lock.unlock();
-                }
-                cur_l2 += diff * diff;
-
-            } while(increment_rowmajor(index, length));
-            linf = std::max(linf, cur_linf);
-            l2 += cur_l2;
-        }
-    }
-    return {.l_2 = sqrt(l2), .l_inf = linf};
-}
-
-// Compute the L-infinity and L-2 distance between two buffers with strides istride and
-// length idist between batches to a buffer with strides ostride and length odist between
-// batches.  input is complex-interleaved, output is complex-planar.
-template <typename Tval, typename Tint1, typename T2, typename T3>
-inline VectorNorms distance_1to2(const std::complex<Tval>*               input,
-                                 const Tval*                             output0,
-                                 const Tval*                             output1,
-                                 const Tint1&                            whole_length,
-                                 const size_t                            nbatch,
-                                 const T2&                               istride,
-                                 const size_t                            idist,
-                                 const T3&                               ostride,
-                                 const size_t                            odist,
-                                 std::vector<std::pair<size_t, size_t>>& linf_failures,
-                                 const double                            linf_cutoff,
-                                 const std::vector<size_t>&              ioffset,
-                                 const std::vector<size_t>&              ooffset)
-{
-    double linf = 0.0;
-    double l2   = 0.0;
-
-    std::mutex linf_failure_lock;
-
-    const bool idx_equals_odx = istride == ostride && idist == odist;
-    size_t     idx_base       = 0;
-    size_t     odx_base       = 0;
-    auto       partitions     = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
-    {
-#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            double     cur_linf = 0.0;
-            double     cur_l2   = 0.0;
-            auto       index    = partitions[part].first;
-            const auto length   = partitions[part].second;
-            do
-            {
-                const auto   idx = compute_index(index, istride, idx_base);
-                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
-                const double rdiff
-                    = std::abs(output0[odx + ooffset[0]] - input[idx + ioffset[0]].real());
-                cur_linf = std::max(rdiff, cur_linf);
-                if(cur_linf > linf_cutoff)
-                {
-                    std::pair<size_t, size_t> fval(b, idx);
-                    linf_failure_lock.lock();
-                    linf_failures.push_back(fval);
-                    linf_failure_lock.unlock();
-                }
-                cur_l2 += rdiff * rdiff;
-
-                const double idiff
-                    = std::abs(output1[odx + ooffset[1]] - input[idx + ioffset[0]].imag());
-                cur_linf = std::max(idiff, cur_linf);
-                if(cur_linf > linf_cutoff)
-                {
-                    std::pair<size_t, size_t> fval(b, idx);
-                    linf_failure_lock.lock();
-                    linf_failures.push_back(fval);
-                    linf_failure_lock.unlock();
-                }
-                cur_l2 += idiff * idiff;
-
-            } while(increment_rowmajor(index, length));
-            linf = std::max(linf, cur_linf);
-            l2 += cur_l2;
-        }
-    }
-    return {.l_2 = sqrt(l2), .l_inf = linf};
-}
-
-// Compute the L-inifnity and L-2 distance between two buffers of dimension length and
-// with types given by itype, otype, and precision.
-template <typename Tallocator1,
-          typename Tallocator2,
-          typename Tint1,
-          typename Tint2,
-          typename Tint3>
-inline VectorNorms distance(const std::vector<std::vector<char, Tallocator1>>& input,
-                            const std::vector<std::vector<char, Tallocator2>>& output,
-                            const Tint1&                                       length,
-                            const size_t                                       nbatch,
-                            const fft_precision                                precision,
-                            const fft_array_type                               itype,
-                            const Tint2&                                       istride,
-                            const size_t                                       idist,
-                            const fft_array_type                               otype,
-                            const Tint3&                                       ostride,
-                            const size_t                                       odist,
-                            std::vector<std::pair<size_t, size_t>>&            linf_failures,
-                            const double                                       linf_cutoff,
-                            const std::vector<size_t>&                         ioffset,
-                            const std::vector<size_t>&                         ooffset)
-{
-    VectorNorms dist;
-
-    if(itype == otype)
-    {
-        switch(itype)
-        {
-        case fft_array_type_complex_interleaved:
-        case fft_array_type_hermitian_interleaved:
-            switch(precision)
-            {
-            case fft_precision_single:
-                dist = distance_1to1_complex(
-                    reinterpret_cast<const std::complex<float>*>(input[0].data()),
-                    reinterpret_cast<const std::complex<float>*>(output[0].data()),
-                    length,
-                    nbatch,
-                    istride,
-                    idist,
-                    ostride,
-                    odist,
-                    linf_failures,
-                    linf_cutoff,
-                    ioffset,
-                    ooffset);
-                break;
-            case fft_precision_double:
-                dist = distance_1to1_complex(
-                    reinterpret_cast<const std::complex<double>*>(input[0].data()),
-                    reinterpret_cast<const std::complex<double>*>(output[0].data()),
-                    length,
-                    nbatch,
-                    istride,
-                    idist,
-                    ostride,
-                    odist,
-                    linf_failures,
-                    linf_cutoff,
-                    ioffset,
-                    ooffset);
-                break;
-            }
-            dist.l_2 *= dist.l_2;
-            break;
-        case fft_array_type_real:
-        case fft_array_type_complex_planar:
-        case fft_array_type_hermitian_planar:
-            for(unsigned int idx = 0; idx < input.size(); ++idx)
-            {
-                VectorNorms d;
-                switch(precision)
-                {
-                case fft_precision_single:
-                    d = distance_1to1_real(reinterpret_cast<const float*>(input[idx].data()),
-                                           reinterpret_cast<const float*>(output[idx].data()),
-                                           length,
-                                           nbatch,
-                                           istride,
-                                           idist,
-                                           ostride,
-                                           odist,
-                                           linf_failures,
-                                           linf_cutoff,
-                                           ioffset,
-                                           ooffset);
-                    break;
-                case fft_precision_double:
-                    d = distance_1to1_real(reinterpret_cast<const double*>(input[idx].data()),
-                                           reinterpret_cast<const double*>(output[idx].data()),
-                                           length,
-                                           nbatch,
-                                           istride,
-                                           idist,
-                                           ostride,
-                                           odist,
-                                           linf_failures,
-                                           linf_cutoff,
-                                           ioffset,
-                                           ooffset);
-                    break;
-                }
-                dist.l_inf = std::max(d.l_inf, dist.l_inf);
-                dist.l_2 += d.l_2 * d.l_2;
-            }
-            break;
-        default:
-            throw std::runtime_error("Invalid input and output types.");
-        }
-    }
-    else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
-            || (itype == fft_array_type_hermitian_interleaved
-                && otype == fft_array_type_hermitian_planar))
-    {
-        switch(precision)
-        {
-        case fft_precision_single:
-            dist = distance_1to2(reinterpret_cast<const std::complex<float>*>(input[0].data()),
-                                 reinterpret_cast<const float*>(output[0].data()),
-                                 reinterpret_cast<const float*>(output[1].data()),
-                                 length,
-                                 nbatch,
-                                 istride,
-                                 idist,
-                                 ostride,
-                                 odist,
-                                 linf_failures,
-                                 linf_cutoff,
-                                 ioffset,
-                                 ooffset);
-            break;
-        case fft_precision_double:
-            dist = distance_1to2(reinterpret_cast<const std::complex<double>*>(input[0].data()),
-                                 reinterpret_cast<const double*>(output[0].data()),
-                                 reinterpret_cast<const double*>(output[1].data()),
-                                 length,
-                                 nbatch,
-                                 istride,
-                                 idist,
-                                 ostride,
-                                 odist,
-                                 linf_failures,
-                                 linf_cutoff,
-                                 ioffset,
-                                 ooffset);
-            break;
-        }
-        dist.l_2 *= dist.l_2;
-    }
-    else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
-            || (itype == fft_array_type_hermitian_planar
-                && otype == fft_array_type_hermitian_interleaved))
-    {
-        switch(precision)
-        {
-        case fft_precision_single:
-            dist = distance_1to2(reinterpret_cast<const std::complex<float>*>(output[0].data()),
-                                 reinterpret_cast<const float*>(input[0].data()),
-                                 reinterpret_cast<const float*>(input[1].data()),
-                                 length,
-                                 nbatch,
-                                 ostride,
-                                 odist,
-                                 istride,
-                                 idist,
-                                 linf_failures,
-                                 linf_cutoff,
-                                 ioffset,
-                                 ooffset);
-            break;
-        case fft_precision_double:
-            dist = distance_1to2(reinterpret_cast<const std::complex<double>*>(output[0].data()),
-                                 reinterpret_cast<const double*>(input[0].data()),
-                                 reinterpret_cast<const double*>(input[1].data()),
-                                 length,
-                                 nbatch,
-                                 ostride,
-                                 odist,
-                                 istride,
-                                 idist,
-                                 linf_failures,
-                                 linf_cutoff,
-                                 ioffset,
-                                 ooffset);
-            break;
-        }
-        dist.l_2 *= dist.l_2;
-    }
-    else
-    {
-        throw std::runtime_error("Invalid input and output types.");
-    }
-    dist.l_2 = sqrt(dist.l_2);
-    return dist;
-}
-
-// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions
-template <typename Tallocator1,
-          typename Tallocator2,
-          typename Tint1,
-          typename Tint2,
-          typename Tint3>
-inline VectorNorms distance(const std::vector<std::vector<char, Tallocator1>>& input,
-                            const std::vector<std::vector<char, Tallocator2>>& output,
-                            const std::vector<Tint1>&                          length,
-                            const size_t                                       nbatch,
-                            const fft_precision                                precision,
-                            const fft_array_type                               itype,
-                            const std::vector<Tint2>&                          istride,
-                            const size_t                                       idist,
-                            const fft_array_type                               otype,
-                            const std::vector<Tint3>&                          ostride,
-                            const size_t                                       odist,
-                            std::vector<std::pair<size_t, size_t>>&            linf_failures,
-                            const double                                       linf_cutoff,
-                            const std::vector<size_t>&                         ioffset,
-                            const std::vector<size_t>&                         ooffset)
-{
-    switch(length.size())
-    {
-    case 1:
-        return distance(input,
-                        output,
-                        length[0],
-                        nbatch,
-                        precision,
-                        itype,
-                        istride[0],
-                        idist,
-                        otype,
-                        ostride[0],
-                        odist,
-                        linf_failures,
-                        linf_cutoff,
-                        ioffset,
-                        ooffset);
-    case 2:
-        return distance(input,
-                        output,
-                        std::make_tuple(length[0], length[1]),
-                        nbatch,
-                        precision,
-                        itype,
-                        std::make_tuple(istride[0], istride[1]),
-                        idist,
-                        otype,
-                        std::make_tuple(ostride[0], ostride[1]),
-                        odist,
-                        linf_failures,
-                        linf_cutoff,
-                        ioffset,
-                        ooffset);
-    case 3:
-        return distance(input,
-                        output,
-                        std::make_tuple(length[0], length[1], length[2]),
-                        nbatch,
-                        precision,
-                        itype,
-                        std::make_tuple(istride[0], istride[1], istride[2]),
-                        idist,
-                        otype,
-                        std::make_tuple(ostride[0], ostride[1], ostride[2]),
-                        odist,
-                        linf_failures,
-                        linf_cutoff,
-                        ioffset,
-                        ooffset);
-    default:
-        abort();
-    }
-}
-
-// Compute the L-infinity and L-2 norm of a buffer with strides istride and
-// length idist.  Data is std::complex.
-template <typename Tcomplex, typename T1, typename T2>
-inline VectorNorms norm_complex(const Tcomplex*            input,
-                                const T1&                  whole_length,
-                                const size_t               nbatch,
-                                const T2&                  istride,
-                                const size_t               idist,
-                                const std::vector<size_t>& offset)
-{
-    double linf = 0.0;
-    double l2   = 0.0;
-
-    size_t idx_base   = 0;
-    auto   partitions = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist)
-    {
-#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            double     cur_linf = 0.0;
-            double     cur_l2   = 0.0;
-            auto       index    = partitions[part].first;
-            const auto length   = partitions[part].second;
-            do
-            {
-                const auto idx = compute_index(index, istride, idx_base);
-
-                const double rval = std::abs(input[idx + offset[0]].real());
-                cur_linf          = std::max(rval, cur_linf);
-                cur_l2 += rval * rval;
-
-                const double ival = std::abs(input[idx + offset[0]].imag());
-                cur_linf          = std::max(ival, cur_linf);
-                cur_l2 += ival * ival;
-
-            } while(increment_rowmajor(index, length));
-            linf = std::max(linf, cur_linf);
-            l2 += cur_l2;
-        }
-    }
-    return {.l_2 = sqrt(l2), .l_inf = linf};
-}
-
-// Compute the L-infinity and L-2 norm of abuffer with strides istride and
-// length idist.  Data is real-valued.
-template <typename Tfloat, typename T1, typename T2>
-inline VectorNorms norm_real(const Tfloat*              input,
-                             const T1&                  whole_length,
-                             const size_t               nbatch,
-                             const T2&                  istride,
-                             const size_t               idist,
-                             const std::vector<size_t>& offset)
-{
-    double linf = 0.0;
-    double l2   = 0.0;
-
-    size_t idx_base   = 0;
-    auto   partitions = partition_rowmajor(whole_length);
-    for(size_t b = 0; b < nbatch; b++, idx_base += idist)
-    {
-#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
-        for(size_t part = 0; part < partitions.size(); ++part)
-        {
-            double     cur_linf = 0.0;
-            double     cur_l2   = 0.0;
-            auto       index    = partitions[part].first;
-            const auto length   = partitions[part].second;
-            do
-            {
-                const auto   idx = compute_index(index, istride, idx_base);
-                const double val = std::abs(input[idx + offset[0]]);
-                cur_linf         = std::max(val, cur_linf);
-                cur_l2 += val * val;
-
-            } while(increment_rowmajor(index, length));
-            linf = std::max(linf, cur_linf);
-            l2 += cur_l2;
-        }
-    }
-    return {.l_2 = sqrt(l2), .l_inf = linf};
-}
-
-// Compute the L-infinity and L-2 norm of abuffer with strides istride and
-// length idist.  Data format is given by precision and itype.
-template <typename Tallocator1, typename T1, typename T2>
-inline VectorNorms norm(const std::vector<std::vector<char, Tallocator1>>& input,
-                        const T1&                                          length,
-                        const size_t                                       nbatch,
-                        const fft_precision                                precision,
-                        const fft_array_type                               itype,
-                        const T2&                                          istride,
-                        const size_t                                       idist,
-                        const std::vector<size_t>&                         offset)
-{
-    VectorNorms norm;
-
-    switch(itype)
-    {
-    case fft_array_type_complex_interleaved:
-    case fft_array_type_hermitian_interleaved:
-        switch(precision)
-        {
-        case fft_precision_single:
-            norm = norm_complex(reinterpret_cast<const std::complex<float>*>(input[0].data()),
-                                length,
-                                nbatch,
-                                istride,
-                                idist,
-                                offset);
-            break;
-        case fft_precision_double:
-            norm = norm_complex(reinterpret_cast<const std::complex<double>*>(input[0].data()),
-                                length,
-                                nbatch,
-                                istride,
-                                idist,
-                                offset);
-            break;
-        }
-        norm.l_2 *= norm.l_2;
-        break;
-    case fft_array_type_real:
-    case fft_array_type_complex_planar:
-    case fft_array_type_hermitian_planar:
-        for(unsigned int idx = 0; idx < input.size(); ++idx)
-        {
-            VectorNorms n;
-            switch(precision)
-            {
-            case fft_precision_single:
-                n = norm_real(reinterpret_cast<const float*>(input[idx].data()),
-                              length,
-                              nbatch,
-                              istride,
-                              idist,
-                              offset);
-                break;
-            case fft_precision_double:
-                n = norm_real(reinterpret_cast<const double*>(input[idx].data()),
-                              length,
-                              nbatch,
-                              istride,
-                              idist,
-                              offset);
-                break;
-            }
-            norm.l_inf = std::max(n.l_inf, norm.l_inf);
-            norm.l_2 += n.l_2 * n.l_2;
-        }
-        break;
-    default:
-        throw std::runtime_error("Invalid data type");
-    }
-
-    norm.l_2 = sqrt(norm.l_2);
-    return norm;
-}
-
-// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions
-template <typename Tallocator1, typename T1, typename T2>
-inline VectorNorms norm(const std::vector<std::vector<char, Tallocator1>>& input,
-                        const std::vector<T1>&                             length,
-                        const size_t                                       nbatch,
-                        const fft_precision                                precision,
-                        const fft_array_type                               type,
-                        const std::vector<T2>&                             stride,
-                        const size_t                                       dist,
-                        const std::vector<size_t>&                         offset)
-{
-    switch(length.size())
-    {
-    case 1:
-        return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset);
-    case 2:
-        return norm(input,
-                    std::make_tuple(length[0], length[1]),
-                    nbatch,
-                    precision,
-                    type,
-                    std::make_tuple(stride[0], stride[1]),
-                    dist,
-                    offset);
-    case 3:
-        return norm(input,
-                    std::make_tuple(length[0], length[1], length[2]),
-                    nbatch,
-                    precision,
-                    type,
-                    std::make_tuple(stride[0], stride[1], stride[2]),
-                    dist,
-                    offset);
-    default:
-        abort();
-    }
-}
-
-// Given an array type and transform length, strides, etc, load random floats in [0,1]
-// into the input array of floats/doubles or complex floats/doubles gpu buffers.
-template <typename Tfloat, typename Tint1>
-inline void set_input(std::vector<gpubuf>&       input,
-                      const fft_array_type       itype,
-                      const std::vector<size_t>& length,
-                      const std::vector<size_t>& ilength,
-                      const std::vector<size_t>& stride,
-                      const Tint1&               whole_length,
-                      const Tint1&               istride,
-                      const size_t               idist,
-                      const size_t               nbatch)
-{
-    auto isize = count_iters(whole_length) * nbatch;
-
-    switch(itype)
-    {
-    case fft_array_type_complex_interleaved:
-    case fft_array_type_hermitian_interleaved:
-    {
-        auto ibuffer = (std::complex<Tfloat>*)input[0].data();
-
-        generate_interleaved_data(whole_length, idist, isize, istride, ibuffer);
-
-        if(itype == fft_array_type_hermitian_interleaved)
-            impose_hermitian_symmetry_interleaved(length, ilength, stride, idist, nbatch, ibuffer);
-
-        break;
-    }
-    case fft_array_type_complex_planar:
-    case fft_array_type_hermitian_planar:
-    {
-        auto ibuffer_real = (Tfloat*)input[0].data();
-        auto ibuffer_imag = (Tfloat*)input[1].data();
-
-        generate_planar_data(whole_length, idist, isize, istride, ibuffer_real, ibuffer_imag);
-
-        if(itype == fft_array_type_hermitian_planar)
-            impose_hermitian_symmetry_planar(
-                length, ilength, stride, idist, nbatch, ibuffer_real, ibuffer_imag);
-
-        break;
-    }
-    case fft_array_type_real:
-    {
-        auto ibuffer = (Tfloat*)input[0].data();
-
-        generate_real_data(whole_length, idist, isize, istride, ibuffer);
-
-        break;
-    }
-    default:
-        throw std::runtime_error("Input layout format not yet supported");
-    }
-}
-
-// unroll set_input for dimension 1, 2, 3
-template <typename Tfloat>
-inline void set_input(std::vector<gpubuf>&       input,
-                      const fft_array_type       itype,
-                      const std::vector<size_t>& length,
-                      const std::vector<size_t>& ilength,
-                      const std::vector<size_t>& istride,
-                      const size_t               idist,
-                      const size_t               nbatch)
-{
-    switch(length.size())
-    {
-    case 1:
-        set_input<Tfloat>(
-            input, itype, length, ilength, istride, ilength[0], istride[0], idist, nbatch);
-        break;
-    case 2:
-        set_input<Tfloat>(input,
-                          itype,
-                          length,
-                          ilength,
-                          istride,
-                          std::make_tuple(ilength[0], ilength[1]),
-                          std::make_tuple(istride[0], istride[1]),
-                          idist,
-                          nbatch);
-        break;
-    case 3:
-        set_input<Tfloat>(input,
-                          itype,
-                          length,
-                          ilength,
-                          istride,
-                          std::make_tuple(ilength[0], ilength[1], ilength[2]),
-                          std::make_tuple(istride[0], istride[1], istride[2]),
-                          idist,
-                          nbatch);
-        break;
-    default:
-        abort();
-    }
-}
-
-// Given a data type and precision, the distance between batches, and
-// the batch size, allocate the required host buffer(s).
-template <typename Allocator = std::allocator<char>>
-inline std::vector<std::vector<char, Allocator>> allocate_host_buffer(
-    const fft_precision precision, const fft_array_type type, const std::vector<size_t>& size)
-{
-    std::vector<std::vector<char, Allocator>> buffers(size.size());
-    for(unsigned int i = 0; i < size.size(); ++i)
-    {
-        buffers[i].resize(size[i] * var_size<size_t>(precision, type));
-    }
-    return buffers;
-}
-
-// Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if
-// necessary.
-inline void compute_input(const fft_params& params, std::vector<gpubuf>& input)
-{
-    switch(params.precision)
-    {
-    case fft_precision_double:
-        set_input<double>(input,
-                          params.itype,
-                          params.length,
-                          params.ilength(),
-                          params.istride,
-                          params.idist,
-                          params.nbatch);
-        break;
-    case fft_precision_single:
-        set_input<float>(input,
-                         params.itype,
-                         params.length,
-                         params.ilength(),
-                         params.istride,
-                         params.idist,
-                         params.nbatch);
-        break;
-    }
-}
-
-// Check if the required buffers fit in the device vram.
-inline bool vram_fits_problem(const size_t prob_size, int deviceId = 0)
-{
-    // We keep a small margin of error for fitting the problem into vram:
-    const size_t extra = 1 << 20;
-
-    // Check free and total available memory:
-    size_t free   = 0;
-    size_t total  = 0;
-    auto   retval = hipMemGetInfo(&free, &total);
-
-    if(retval != hipSuccess)
-        throw std::runtime_error("Failure in hipMemGetInfo");
-
-    if(total < prob_size + extra)
-        return false;
-
-    if(free < prob_size + extra)
-        return false;
-
-    return true;
-}
-
-// Computes the twiddle table VRAM footprint for r2c/c2r transforms.
-// This function will return 0 for the other transform types, since
-// the VRAM footprint in rocFFT is negligible for the other cases.
-inline size_t twiddle_table_vram_footprint(const fft_params& params)
-{
-    size_t vram_footprint = 0;
-
-    // Add vram footprint from real/complex even twiddle buffer size.
-    if(params.transform_type == fft_transform_type_real_forward
-       || params.transform_type == fft_transform_type_real_inverse)
-    {
-        const auto realdim = params.length.back();
-        if(realdim % 2 == 0)
-        {
-            const auto complex_size = params.precision == fft_precision_single ? 8 : 16;
-            // even length twiddle size is 1/4 of the real size, but
-            // in complex elements
-            vram_footprint += realdim * complex_size / 4;
-        }
-    }
-
-    return vram_footprint;
-}
-
-#endif
diff -Nru rocfft-5.5.0/clients/rider/CMakeLists.txt rocfft-5.7.1/clients/rider/CMakeLists.txt
--- rocfft-5.5.0/clients/rider/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/rider/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -60,8 +60,8 @@
   find_package( ROCM 0.7.3 REQUIRED )
 endif()
 
-if( NOT rocrand_FOUND )
-  find_package( rocrand REQUIRED )
+if( NOT hiprand_FOUND )
+  find_package( hiprand REQUIRED )
 endif()
 
 include( ROCMInstallTargets )
@@ -71,7 +71,7 @@
 
 set( rider_list rocfft-rider dyna-rocfft-rider )
 foreach( rider ${rider_list})
-  
+
   if(${rider} STREQUAL "rocfft-rider")
     add_executable( ${rider} ../../shared/array_validator.cpp rider.cpp rider.h )
   else()
@@ -82,7 +82,7 @@
 
   # NB: hip-clang includes omp.h, so we need to specify the location
   # of ROCM_CLANG_ROOT at cmake config time if we are using clang++.
-  
+
   target_include_directories( ${rider}
     PRIVATE
     $<BUILD_INTERFACE:${Boost_INCLUDE_DIRS}>
@@ -96,16 +96,16 @@
       PRIVATE
       hip::device
       roc::rocfft
-      roc::rocrand
+      hip::hiprand
       Boost::program_options
       )
   else()
-    target_link_libraries( ${rider} 
+    target_link_libraries( ${rider}
       PRIVATE
       ${CMAKE_DL_LIBS}
       hip::device
-      roc::rocrand
-      ${Boost_LIBRARIES}      
+      hip::hiprand
+      ${Boost_LIBRARIES}
       )
 
     # We need to include both rocfft.h and rocfft-export.h
@@ -136,10 +136,10 @@
   endif()
   string( CONCAT RIDER_OUT_DIR "${PROJECT_BINARY_DIR}" ${RIDER_OUT_DIR} )
 
-  set_target_properties(${rider} 
-                        PROPERTIES 
-                        RUNTIME_OUTPUT_DIRECTORY 
+  set_target_properties(${rider}
+                        PROPERTIES
+                        RUNTIME_OUTPUT_DIRECTORY
                         ${RIDER_OUT_DIR} )
-  
+
   rocm_install(TARGETS ${rider} COMPONENT benchmarks)
 endforeach()
diff -Nru rocfft-5.5.0/clients/rider/dyna-rider.cpp rocfft-5.7.1/clients/rider/dyna-rider.cpp
--- rocfft-5.5.0/clients/rider/dyna-rider.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/rider/dyna-rider.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -22,7 +22,7 @@
 // This allows one to randomize the execution order for better a better experimental setup
 // which produces fewer type 1 errors where one incorrectly rejects the null hypothesis.
 
-#include <complex>
+#include <algorithm>
 #include <hip/hip_runtime_api.h>
 #include <iostream>
 #include <math.h>
@@ -38,7 +38,7 @@
 #endif
 
 #include "../../shared/gpubuf.h"
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "rider.h"
 #include "rocfft.h"
 
@@ -340,9 +340,12 @@
     // hip Device number for running tests:
     int deviceId{};
 
-    // Number of performance trial samples
+    // Number of performance trial samples:
     int ntrial{};
 
+    // Test sequence choice:
+    int test_sequence{};
+
     // Vector of test target libraries
     std::vector<std::string> libs;
 
@@ -362,8 +365,11 @@
         ("device", po::value<int>(&deviceId)->default_value(0), "Select a specific device id")
         ("verbose", po::value<int>(&verbose)->default_value(0), "Control output verbosity")
         ("ntrial,N", po::value<int>(&ntrial)->default_value(1), "Trial size for the problem")
+        ("sequence", po::value<int>(&test_sequence)->default_value(0),
+         "Test sequence: random(0), alternating(1) sequential(2)")
         ("notInPlace,o", "Not in-place FFT transform (default: in-place)")
-        ("double", "Double precision transform (default: single)")
+        ("double", "Double precision transform (deprecated: use --precision double)")
+        ("precision", po::value<fft_precision>(&params.precision), "Transform precision: single (default), double, half")
         ("transformType,t", po::value<fft_transform_type>(&params.transform_type)
          ->default_value(fft_transform_type_complex_forward),
          "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
@@ -394,7 +400,7 @@
         ("ioffset", po::value<std::vector<size_t>>(&params.ioffset)->multitoken(), "Input offsets.")
         ("ooffset", po::value<std::vector<size_t>>(&params.ooffset)->multitoken(), "Output offsets.")
         ("scalefactor", po::value<double>(&params.scale_factor), "Scale factor to apply to output.")
-        ("token", po::value<std::string>(&token));;
+        ("token", po::value<std::string>(&token));
     // clang-format on
 
     po::variables_map vm;
@@ -446,7 +452,8 @@
 
         params.placement
             = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace;
-        params.precision = vm.count("double") ? fft_precision_double : fft_precision_single;
+        if(vm.count("double"))
+            params.precision = fft_precision_double;
 
         if(vm.count("notInPlace"))
         {
@@ -524,9 +531,14 @@
         std::cout << params.str() << std::endl;
     }
 
+    // Check free and total available memory:
+    size_t free  = 0;
+    size_t total = 0;
+    HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
+
     const auto raw_vram_footprint
         = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
-    if(!vram_fits_problem(raw_vram_footprint))
+    if(!vram_fits_problem(raw_vram_footprint, free))
     {
         std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
                   << ") raw data too large for device.\n";
@@ -534,7 +546,7 @@
     }
 
     const auto vram_footprint = params.vram_footprint();
-    if(!vram_fits_problem(vram_footprint))
+    if(!vram_fits_problem(vram_footprint, free))
     {
         std::cout << "SKIPPED: Problem size (" << vram_footprint
                   << ") raw data too large for device.\n";
@@ -618,7 +630,7 @@
     }
 
     // Input data:
-    compute_input(params, ibuffer);
+    params.compute_input(ibuffer);
 
     if(verbose > 1)
     {
@@ -671,21 +683,64 @@
     // Execution times for loaded libraries:
     std::vector<std::vector<double>> time(libs.size());
 
+    std::vector<int> testcase(ntrial * libs.size());
+    switch(test_sequence)
+    {
+    case 0:
+    {
+        // Random order:
+        for(int itrial = 0; itrial < ntrial; ++itrial)
+        {
+            for(size_t ilib = 0; ilib < libs.size(); ++ilib)
+            {
+                testcase[libs.size() * itrial + ilib] = ilib;
+            }
+        }
+
+        std::random_device rd;
+        std::mt19937       g(rd());
+        std::shuffle(testcase.begin(), testcase.end(), g);
+        break;
+    }
+    case 1:
+        // Alternating order:
+        for(int itrial = 0; itrial < ntrial; ++itrial)
+        {
+            for(size_t ilib = 0; ilib < libs.size(); ++ilib)
+            {
+                testcase[libs.size() * itrial + ilib] = ilib;
+            }
+        }
+        break;
+    case 2:
+        // Sequential order:
+        for(int itrial = 0; itrial < ntrial; ++itrial)
+        {
+            for(size_t ilib = 0; ilib < libs.size(); ++ilib)
+            {
+                testcase[ilib * ntrial + itrial] = ilib;
+            }
+        }
+
+        break;
+    default:
+        throw std::runtime_error("Invalid test sequence choice.");
+    }
+
+    std::cout << "test case:";
+    for(const auto i : testcase)
+        std::cout << " " << i;
+    std::cout << "\n";
+
     // Run the FFTs from the different libraries in random order until they all have at
     // least ntrial times.
     std::vector<int> ndone(libs.size());
     std::fill(ndone.begin(), ndone.end(), 0);
-    while(!std::all_of(ndone.begin(), ndone.end(), [&ntrial](int i) { return (i >= ntrial); }))
+    for(size_t itest = 0; itest < testcase.size(); ++itest)
     {
-        const int idx = rand() % ndone.size();
-        ndone[idx]++;
-
-        // We can optionally require that all runs have exactly ntrial, but it may be more
-        // iid to just let things run:
-        // if(ndone[idx] > ntrial)
-        //     continue;
+        const int idx = testcase[itest];
 
-        compute_input(params, ibuffer);
+        params.compute_input(ibuffer);
 
         // Run the plan using its associated rocFFT library:
         time[idx].push_back(
diff -Nru rocfft-5.5.0/clients/rider/rider.cpp rocfft-5.7.1/clients/rider/rider.cpp
--- rocfft-5.5.0/clients/rider/rider.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/rider/rider.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,7 @@
 #include <sstream>
 
 #include "../../shared/gpubuf.h"
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "rider.h"
 #include "rocfft.h"
 #include <boost/program_options.hpp>
@@ -61,7 +61,8 @@
         ("verbose", po::value<int>(&verbose)->default_value(0), "Control output verbosity")
         ("ntrial,N", po::value<int>(&ntrial)->default_value(1), "Trial size for the problem")
         ("notInPlace,o", "Not in-place FFT transform (default: in-place)")
-        ("double", "Double precision transform (default: single)")
+        ("double", "Double precision transform (deprecated: use --precision double)")
+        ("precision", po::value<fft_precision>(&params.precision), "Transform precision: single (default), double, half")
         ("transformType,t", po::value<fft_transform_type>(&params.transform_type)
          ->default_value(fft_transform_type_complex_forward),
          "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
@@ -141,7 +142,8 @@
 
         params.placement
             = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace;
-        params.precision = vm.count("double") ? fft_precision_double : fft_precision_single;
+        if(vm.count("double"))
+            params.precision = fft_precision_double;
 
         if(vm.count("notInPlace"))
         {
@@ -221,9 +223,13 @@
         std::cout << params.str(" ") << std::endl;
     }
 
+    // Check free and total available memory:
+    size_t free  = 0;
+    size_t total = 0;
+    HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
     const auto raw_vram_footprint
         = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
-    if(!vram_fits_problem(raw_vram_footprint))
+    if(!vram_fits_problem(raw_vram_footprint, free))
     {
         std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
                   << ") raw data too large for device.\n";
@@ -231,7 +237,7 @@
     }
 
     const auto vram_footprint = params.vram_footprint();
-    if(!vram_fits_problem(vram_footprint))
+    if(!vram_fits_problem(vram_footprint, free))
     {
         std::cout << "SKIPPED: Problem size (" << vram_footprint
                   << ") raw data too large for device.\n";
@@ -253,7 +259,7 @@
     }
 
     // Input data:
-    compute_input(params, ibuffer);
+    params.compute_input(ibuffer);
 
     if(verbose > 1)
     {
@@ -304,7 +310,7 @@
     HIP_V_THROW(hipEventCreate(&stop), "hipEventCreate failed");
     for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial)
     {
-        compute_input(params, ibuffer);
+        params.compute_input(ibuffer);
 
         HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");
 
diff -Nru rocfft-5.5.0/clients/rocfft_params.h rocfft-5.7.1/clients/rocfft_params.h
--- rocfft-5.5.0/clients/rocfft_params.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/rocfft_params.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,313 +0,0 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#ifndef ROCFFT_PARAMS_H
-#define ROCFFT_PARAMS_H
-
-#include "../shared/gpubuf.h"
-#include "fft_params.h"
-#include "rocfft.h"
-
-inline fft_status fft_status_from_rocfftparams(const rocfft_status val)
-{
-    switch(val)
-    {
-    case rocfft_status_success:
-        return fft_status_success;
-    case rocfft_status_failure:
-        return fft_status_failure;
-    case rocfft_status_invalid_arg_value:
-        return fft_status_invalid_arg_value;
-    case rocfft_status_invalid_dimensions:
-        return fft_status_invalid_dimensions;
-    case rocfft_status_invalid_array_type:
-        return fft_status_invalid_array_type;
-    case rocfft_status_invalid_strides:
-        return fft_status_invalid_strides;
-    case rocfft_status_invalid_distance:
-        return fft_status_invalid_distance;
-    case rocfft_status_invalid_offset:
-        return fft_status_invalid_offset;
-    case rocfft_status_invalid_work_buffer:
-        return fft_status_invalid_work_buffer;
-    default:
-        throw std::runtime_error("Invalid status");
-    }
-}
-
-inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val)
-{
-    switch(val)
-    {
-    case fft_precision_single:
-        return rocfft_precision_single;
-    case fft_precision_double:
-        return rocfft_precision_double;
-    default:
-        throw std::runtime_error("Invalid precision");
-    }
-}
-
-inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val)
-{
-    switch(val)
-    {
-    case fft_array_type_complex_interleaved:
-        return rocfft_array_type_complex_interleaved;
-    case fft_array_type_complex_planar:
-        return rocfft_array_type_complex_planar;
-    case fft_array_type_real:
-        return rocfft_array_type_real;
-    case fft_array_type_hermitian_interleaved:
-        return rocfft_array_type_hermitian_interleaved;
-    case fft_array_type_hermitian_planar:
-        return rocfft_array_type_hermitian_planar;
-    case fft_array_type_unset:
-        return rocfft_array_type_unset;
-    }
-    return rocfft_array_type_unset;
-}
-
-inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val)
-{
-    switch(val)
-    {
-    case fft_transform_type_complex_forward:
-        return rocfft_transform_type_complex_forward;
-    case fft_transform_type_complex_inverse:
-        return rocfft_transform_type_complex_inverse;
-    case fft_transform_type_real_forward:
-        return rocfft_transform_type_real_forward;
-    case fft_transform_type_real_inverse:
-        return rocfft_transform_type_real_inverse;
-    default:
-        throw std::runtime_error("Invalid transform type");
-    }
-}
-
-inline rocfft_result_placement
-    rocfft_result_placement_from_fftparams(const fft_result_placement val)
-{
-    switch(val)
-    {
-    case fft_placement_inplace:
-        return rocfft_placement_inplace;
-    case fft_placement_notinplace:
-        return rocfft_placement_notinplace;
-    default:
-        throw std::runtime_error("Invalid result placement");
-    }
-}
-
-class rocfft_params : public fft_params
-{
-public:
-    rocfft_plan             plan = nullptr;
-    rocfft_execution_info   info = nullptr;
-    rocfft_plan_description desc = nullptr;
-    gpubuf_t<void>          wbuffer;
-
-    explicit rocfft_params(){};
-
-    explicit rocfft_params(const fft_params& p)
-        : fft_params(p){};
-
-    rocfft_params(const rocfft_params&) = delete;
-    rocfft_params& operator=(const rocfft_params&) = delete;
-
-    ~rocfft_params()
-    {
-        free();
-    };
-
-    void free()
-    {
-        if(plan != nullptr)
-        {
-            rocfft_plan_destroy(plan);
-            plan = nullptr;
-        }
-        if(info != nullptr)
-        {
-            rocfft_execution_info_destroy(info);
-            info = nullptr;
-        }
-        if(desc != nullptr)
-        {
-            rocfft_plan_description_destroy(desc);
-            desc = nullptr;
-        }
-    }
-
-    rocfft_precision get_rocfft_precision()
-    {
-        return rocfft_precision_from_fftparams(precision);
-    }
-
-    size_t vram_footprint() override
-    {
-        size_t val = fft_params::vram_footprint();
-        if(setup_structs() != fft_status_success)
-        {
-            throw std::runtime_error("Struct setup failed");
-        }
-        val += workbuffersize;
-
-        return val;
-    }
-
-    fft_status setup_structs()
-    {
-        rocfft_status fft_status = rocfft_status_success;
-        if(desc == nullptr)
-        {
-            rocfft_plan_description_create(&desc);
-            if(fft_status != rocfft_status_success)
-                return fft_status_from_rocfftparams(fft_status);
-
-            fft_status
-                = rocfft_plan_description_set_data_layout(desc,
-                                                          rocfft_array_type_from_fftparams(itype),
-                                                          rocfft_array_type_from_fftparams(otype),
-                                                          ioffset.data(),
-                                                          ooffset.data(),
-                                                          istride_cm().size(),
-                                                          istride_cm().data(),
-                                                          idist,
-                                                          ostride_cm().size(),
-                                                          ostride_cm().data(),
-                                                          odist);
-            if(fft_status != rocfft_status_success)
-            {
-                throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
-            }
-
-            if(scale_factor != 1.0)
-            {
-                fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
-                if(fft_status != rocfft_status_success)
-                {
-                    throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
-                }
-            }
-        }
-
-        if(plan == nullptr)
-        {
-            fft_status = rocfft_plan_create(&plan,
-                                            rocfft_result_placement_from_fftparams(placement),
-                                            rocfft_transform_type_from_fftparams(transform_type),
-                                            get_rocfft_precision(),
-                                            length_cm().size(),
-                                            length_cm().data(),
-                                            nbatch,
-                                            desc);
-            if(fft_status != rocfft_status_success)
-            {
-                throw std::runtime_error("rocfft_plan_create failed");
-            }
-        }
-
-        if(info == nullptr)
-        {
-            fft_status = rocfft_execution_info_create(&info);
-            if(fft_status != rocfft_status_success)
-            {
-                throw std::runtime_error("rocfft_execution_info_create failed");
-            }
-        }
-
-        fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize);
-        if(fft_status != rocfft_status_success)
-        {
-            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
-        }
-
-        return fft_status_from_rocfftparams(fft_status);
-    }
-
-    fft_status create_plan() override
-    {
-        fft_status ret = setup_structs();
-        if(ret != fft_status_success)
-        {
-            return ret;
-        }
-        if(workbuffersize > 0)
-        {
-            hipError_t hip_status = hipSuccess;
-            hip_status            = wbuffer.alloc(workbuffersize);
-            if(hip_status != hipSuccess)
-            {
-                std::ostringstream oss;
-                oss << "work buffer allocation failed (" << workbuffersize << " requested)";
-                size_t mem_free  = 0;
-                size_t mem_total = 0;
-                hip_status       = hipMemGetInfo(&mem_free, &mem_total);
-                if(hip_status == hipSuccess)
-                {
-                    oss << "free vram: " << mem_free << " total vram: " << mem_total;
-                }
-                else
-                {
-                    oss << "hipMemGetInfo also failed";
-                }
-                throw work_buffer_alloc_failure(oss.str());
-            }
-
-            auto rocret
-                = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
-            if(rocret != rocfft_status_success)
-            {
-                throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
-            }
-        }
-
-        return ret;
-    }
-
-    fft_status set_callbacks(void* load_cb_host,
-                             void* load_cb_data,
-                             void* store_cb_host,
-                             void* store_cb_data) override
-    {
-        if(run_callbacks)
-        {
-            auto roc_status
-                = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
-            if(roc_status != rocfft_status_success)
-                return fft_status_from_rocfftparams(roc_status);
-
-            roc_status
-                = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
-            if(roc_status != rocfft_status_success)
-                return fft_status_from_rocfftparams(roc_status);
-        }
-        return fft_status_success;
-    }
-
-    fft_status execute(void** in, void** out) override
-    {
-        auto ret = rocfft_execute(plan, in, out, info);
-        return fft_status_from_rocfftparams(ret);
-    }
-};
-
-#endif
diff -Nru rocfft-5.5.0/clients/samples/fixed-16/CMakeLists.txt rocfft-5.7.1/clients/samples/fixed-16/CMakeLists.txt
--- rocfft-5.5.0/clients/samples/fixed-16/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-16/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 # #############################################################################
-# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -52,7 +52,7 @@
   find_package( HIP REQUIRED )
 endif()
 
-set( sample_list fixed-16-float fixed-16-double )
+set( sample_list fixed-16-float fixed-16-double fixed-16-half )
 
 foreach( sample ${sample_list} )
 
@@ -63,13 +63,13 @@
             $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
   )
 
-  target_link_libraries( ${sample} PRIVATE roc::rocfft ${FFTW_LIBRARIES} )
+  target_link_libraries( ${sample} PRIVATE roc::rocfft hip::device ${FFTW_LIBRARIES} )
 
   target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )
 
   set_target_properties( ${sample} PROPERTIES
     DEBUG_POSTFIX "-d"
-    CXX_STANDARD 14
+    CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
   )
 
diff -Nru rocfft-5.5.0/clients/samples/fixed-16/fixed-16-double.cpp rocfft-5.7.1/clients/samples/fixed-16/fixed-16-double.cpp
--- rocfft-5.5.0/clients/samples/fixed-16/fixed-16-double.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-16/fixed-16-double.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -24,7 +24,6 @@
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_vector_types.h>
 #include <iostream>
-#include <math.h>
 #include <vector>
 
 int main()
@@ -43,57 +42,76 @@
     // rocfft gpu compute
     // ========================================
 
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
 
     size_t Nbytes = N * sizeof(double2);
 
     // Create HIP device object.
     double2* x;
-    hipMalloc(&x, Nbytes);
+    if(hipMalloc(&x, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
 
     //  Copy data to device
-    hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice);
+    if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     // Create plan
     rocfft_plan plan   = NULL;
     size_t      length = N;
-    rocfft_plan_create(&plan,
-                       rocfft_placement_inplace,
-                       rocfft_transform_type_complex_forward,
-                       rocfft_precision_double,
-                       1,
-                       &length,
-                       1,
-                       NULL);
+    if(rocfft_plan_create(&plan,
+                          rocfft_placement_inplace,
+                          rocfft_transform_type_complex_forward,
+                          rocfft_precision_double,
+                          1,
+                          &length,
+                          1,
+                          NULL)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_create failed.");
 
     // Check if the plan requires a work buffer
     size_t work_buf_size = 0;
-    rocfft_plan_get_work_buffer_size(plan, &work_buf_size);
+    if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
     void*                 work_buf = nullptr;
     rocfft_execution_info info     = nullptr;
     if(work_buf_size)
     {
-        rocfft_execution_info_create(&info);
-        hipMalloc(&work_buf, work_buf_size);
-        rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size);
+        if(rocfft_execution_info_create(&info) != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_create failed.");
+        if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
+        if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
+           != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
     }
 
     // Execute plan
-    rocfft_execute(plan, (void**)&x, NULL, info);
+    if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execute failed.");
+    if(hipDeviceSynchronize() != hipSuccess)
+        throw std::runtime_error("hipDeviceSynchronize failed.");
 
     // Clean up work buffer
     if(work_buf_size)
     {
-        hipFree(work_buf);
-        rocfft_execution_info_destroy(info);
+        if(hipFree(work_buf) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
+        if(rocfft_execution_info_destroy(info) != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_destroy failed.");
+        info = nullptr;
     }
 
     // Destroy plan
-    rocfft_plan_destroy(plan);
+    if(rocfft_plan_destroy(plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    plan = nullptr;
 
     // Copy result back to host
     std::vector<double2> y(N);
-    hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
+    if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     for(size_t i = 0; i < N; i++)
     {
@@ -101,9 +119,11 @@
                   << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
     }
 
-    hipFree(x);
+    if(hipFree(x) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
 
-    rocfft_cleanup();
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
 
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/samples/fixed-16/fixed-16-float.cpp rocfft-5.7.1/clients/samples/fixed-16/fixed-16-float.cpp
--- rocfft-5.5.0/clients/samples/fixed-16/fixed-16-float.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-16/fixed-16-float.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -19,19 +19,20 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *******************************************************************************/
+
 #include "rocfft.h"
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_vector_types.h>
 #include <iostream>
-#include <math.h>
 #include <vector>
 
 int main()
 {
-    // For size N <= 4096
+
     const size_t N = 16;
 
     std::vector<float2> cx(N);
+
     for(size_t i = 0; i < N; i++)
     {
         cx[i].x = i + (i % 3) - (i % 7);
@@ -41,57 +42,76 @@
     // rocfft gpu compute
     // ========================================
 
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
 
     size_t Nbytes = N * sizeof(float2);
 
     // Create HIP device object.
     float2* x;
-    hipMalloc(&x, Nbytes);
+    if(hipMalloc(&x, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
 
     //  Copy data to device
-    hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice);
+    if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     // Create plan
     rocfft_plan plan   = NULL;
     size_t      length = N;
-    rocfft_plan_create(&plan,
-                       rocfft_placement_inplace,
-                       rocfft_transform_type_complex_forward,
-                       rocfft_precision_single,
-                       1,
-                       &length,
-                       1,
-                       NULL);
+    if(rocfft_plan_create(&plan,
+                          rocfft_placement_inplace,
+                          rocfft_transform_type_complex_forward,
+                          rocfft_precision_single,
+                          1,
+                          &length,
+                          1,
+                          NULL)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_create failed.");
 
     // Check if the plan requires a work buffer
     size_t work_buf_size = 0;
-    rocfft_plan_get_work_buffer_size(plan, &work_buf_size);
+    if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
     void*                 work_buf = nullptr;
     rocfft_execution_info info     = nullptr;
     if(work_buf_size)
     {
-        rocfft_execution_info_create(&info);
-        hipMalloc(&work_buf, work_buf_size);
-        rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size);
+        if(rocfft_execution_info_create(&info) != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_create failed.");
+        if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
+        if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
+           != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
     }
 
     // Execute plan
-    rocfft_execute(plan, (void**)&x, NULL, NULL);
+    if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execute failed.");
+    if(hipDeviceSynchronize() != hipSuccess)
+        throw std::runtime_error("hipDeviceSynchronize failed.");
 
     // Clean up work buffer
     if(work_buf_size)
     {
-        hipFree(work_buf);
-        rocfft_execution_info_destroy(info);
+        if(hipFree(work_buf) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
+        if(rocfft_execution_info_destroy(info) != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_destroy failed.");
+        info = nullptr;
     }
 
     // Destroy plan
-    rocfft_plan_destroy(plan);
+    if(rocfft_plan_destroy(plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    plan = nullptr;
 
     // Copy result back to host
     std::vector<float2> y(N);
-    hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
+    if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     for(size_t i = 0; i < N; i++)
     {
@@ -99,9 +119,11 @@
                   << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
     }
 
-    hipFree(x);
+    if(hipFree(x) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
 
-    rocfft_cleanup();
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
 
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/samples/fixed-16/fixed-16-half.cpp rocfft-5.7.1/clients/samples/fixed-16/fixed-16-half.cpp
--- rocfft-5.5.0/clients/samples/fixed-16/fixed-16-half.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-16/fixed-16-half.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,139 @@
+/******************************************************************************
+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#include "rocfft.h"
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime_api.h>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+// Sample: in-place, forward, complex, half-precision 1D FFT of length 16.
+// The input and the transformed output are printed to stdout.  The status of
+// every rocFFT and HIP call is checked; any failure throws std::runtime_error,
+// which this function does not catch.  Returns 0 on success.
+int main()
+{
+
+    const size_t N = 16;
+
+    std::vector<_Float16_2> cx(N);
+
+    // Real parts follow the pattern i + (i % 3) - (i % 7); imaginary parts are zero.
+    for(size_t i = 0; i < N; i++)
+    {
+        cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7));
+        cx[i].y = 0;
+    }
+
+    // rocfft gpu compute
+    // ========================================
+
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
+
+    size_t Nbytes = N * sizeof(_Float16_2);
+
+    // Create HIP device object.
+    _Float16_2* x = nullptr;
+    if(hipMalloc(&x, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
+
+    //  Copy data to device
+    if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
+
+    // Create plan
+    rocfft_plan plan   = NULL;
+    size_t      length = N;
+    if(rocfft_plan_create(&plan,
+                          rocfft_placement_inplace,
+                          rocfft_transform_type_complex_forward,
+                          rocfft_precision_half,
+                          1,
+                          &length,
+                          1,
+                          NULL)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_create failed.");
+
+    // Check if the plan requires a work buffer
+    size_t work_buf_size = 0;
+    if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
+    void*                 work_buf = nullptr;
+    rocfft_execution_info info     = nullptr;
+    if(work_buf_size)
+    {
+        if(rocfft_execution_info_create(&info) != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_create failed.");
+        if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
+        if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
+           != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
+    }
+
+    // Execute plan
+    // info is still nullptr here when the plan reported no work buffer requirement.
+    if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execute failed.");
+    // Wait for outstanding device work before freeing the work buffer and reading results.
+    if(hipDeviceSynchronize() != hipSuccess)
+        throw std::runtime_error("hipDeviceSynchronize failed.");
+
+    // Clean up work buffer
+    if(work_buf_size)
+    {
+        if(hipFree(work_buf) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
+        if(rocfft_execution_info_destroy(info) != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_destroy failed.");
+        info = nullptr;
+    }
+
+    // Destroy plan
+    if(rocfft_plan_destroy(plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    plan = nullptr;
+
+    // Copy result back to host
+    std::vector<_Float16_2> y(N);
+    if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
+
+    for(size_t i = 0; i < N; i++)
+    {
+        std::cout << "element " << i << " input:  (" << static_cast<double>(cx[i].x) << ","
+                  << static_cast<double>(cx[i].y) << ")"
+                  << " output: (" << static_cast<double>(y[i].x) << ","
+                  << static_cast<double>(y[i].y) << ")" << std::endl;
+    }
+
+    if(hipFree(x) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
+
+    return 0;
+}
diff -Nru rocfft-5.5.0/clients/samples/fixed-large/CMakeLists.txt rocfft-5.7.1/clients/samples/fixed-large/CMakeLists.txt
--- rocfft-5.5.0/clients/samples/fixed-large/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-large/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 # #############################################################################
-# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -68,7 +68,7 @@
 
   set_target_properties( ${sample} PROPERTIES
     DEBUG_POSTFIX "-d"
-    CXX_STANDARD 14
+    CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
   )
 
diff -Nru rocfft-5.5.0/clients/samples/fixed-large/fixed-large-double.cpp rocfft-5.7.1/clients/samples/fixed-large/fixed-large-double.cpp
--- rocfft-5.5.0/clients/samples/fixed-large/fixed-large-double.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-large/fixed-large-double.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -44,60 +44,78 @@
     // rocfft gpu compute
     // ========================================
 
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
 
     size_t Nbytes = N * sizeof(double2);
 
     // Create HIP device object.
     double2* x;
-    hipMalloc(&x, Nbytes);
+    if(hipMalloc(&x, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
 
     //  Copy data to device
-    hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice);
+    if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     // Create plan
-    rocfft_plan plan   = NULL;
+    rocfft_plan plan   = nullptr;
     size_t      length = N;
-    rocfft_plan_create(&plan,
-                       rocfft_placement_inplace,
-                       rocfft_transform_type_complex_forward,
-                       rocfft_precision_double,
-                       1,
-                       &length,
-                       1,
-                       NULL);
+    if(rocfft_plan_create(&plan,
+                          rocfft_placement_inplace,
+                          rocfft_transform_type_complex_forward,
+                          rocfft_precision_double,
+                          1,
+                          &length,
+                          1,
+                          nullptr)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_create failed.");
 
     // Setup work buffer
     void*  workBuffer     = nullptr;
     size_t workBufferSize = 0;
-    rocfft_plan_get_work_buffer_size(plan, &workBufferSize);
+    if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
 
     // Setup exec info to pass work buffer to the library
     rocfft_execution_info info = nullptr;
-    rocfft_execution_info_create(&info);
+    if(rocfft_execution_info_create(&info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_create failed.");
 
     if(workBufferSize > 0)
     {
         printf("size of workbuffer=%d\n", (int)workBufferSize);
-        hipMalloc(&workBuffer, workBufferSize);
-        rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize);
+        if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
+        if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize)
+           != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
     }
 
     // Execute plan
-    rocfft_execute(plan, (void**)&x, NULL, info);
-    hipDeviceSynchronize();
+    if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execute failed.");
+    if(hipDeviceSynchronize() != hipSuccess)
+        throw std::runtime_error("hipDeviceSynchronize failed.");
 
     // Destroy plan
-    rocfft_plan_destroy(plan);
+    if(rocfft_plan_destroy(plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    plan = nullptr;
 
     if(workBuffer)
-        hipFree(workBuffer);
+        if(hipFree(workBuffer) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
 
-    rocfft_execution_info_destroy(info);
+    if(rocfft_execution_info_destroy(info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_destroy failed.");
+    info = nullptr;
 
     // Copy result back to host
     std::vector<double2> y(N);
-    hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
+    if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     for(size_t i = 0; i < N; i++)
     {
@@ -105,7 +123,11 @@
                   << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
     }
 
-    rocfft_cleanup();
+    if(hipFree(x) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
 
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/samples/fixed-large/fixed-large-float.cpp rocfft-5.7.1/clients/samples/fixed-large/fixed-large-float.cpp
--- rocfft-5.5.0/clients/samples/fixed-large/fixed-large-float.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/fixed-large/fixed-large-float.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -44,60 +44,78 @@
     // rocfft gpu compute
     // ========================================
 
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
 
     size_t Nbytes = N * sizeof(float2);
 
     // Create HIP device object.
     float2* x;
-    hipMalloc(&x, Nbytes);
+    if(hipMalloc(&x, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
 
     //  Copy data to device
-    hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice);
+    if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     // Create plan
-    rocfft_plan plan   = NULL;
+    rocfft_plan plan   = nullptr;
     size_t      length = N;
-    rocfft_plan_create(&plan,
-                       rocfft_placement_inplace,
-                       rocfft_transform_type_complex_forward,
-                       rocfft_precision_single,
-                       1,
-                       &length,
-                       1,
-                       NULL);
+    if(rocfft_plan_create(&plan,
+                          rocfft_placement_inplace,
+                          rocfft_transform_type_complex_forward,
+                          rocfft_precision_single,
+                          1,
+                          &length,
+                          1,
+                          nullptr)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_create failed.");
 
     // Setup work buffer
     void*  workBuffer     = nullptr;
     size_t workBufferSize = 0;
-    rocfft_plan_get_work_buffer_size(plan, &workBufferSize);
+    if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
 
     // Setup exec info to pass work buffer to the library
     rocfft_execution_info info = nullptr;
-    rocfft_execution_info_create(&info);
+    if(rocfft_execution_info_create(&info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_create failed.");
 
     if(workBufferSize > 0)
     {
         printf("size of workbuffer=%d\n", (int)workBufferSize);
-        hipMalloc(&workBuffer, workBufferSize);
-        rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize);
+        if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
+        if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize)
+           != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
     }
 
     // Execute plan
-    rocfft_execute(plan, (void**)&x, NULL, info);
-    hipDeviceSynchronize();
+    if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execute failed.");
+    if(hipDeviceSynchronize() != hipSuccess)
+        throw std::runtime_error("hipDeviceSynchronize failed.");
 
     // Destroy plan
-    rocfft_plan_destroy(plan);
+    if(rocfft_plan_destroy(plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    plan = nullptr;
 
     if(workBuffer)
-        hipFree(workBuffer);
+        if(hipFree(workBuffer) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
 
-    rocfft_execution_info_destroy(info);
+    if(rocfft_execution_info_destroy(info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_destroy failed.");
+    info = nullptr;
 
     // Copy result back to host
     std::vector<float2> y(N);
-    hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
+    if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     for(size_t i = 0; i < N; i++)
     {
@@ -105,7 +123,11 @@
                   << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
     }
 
-    rocfft_cleanup();
+    if(hipFree(x) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
 
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/samples/rocfft/CMakeLists.txt rocfft-5.7.1/clients/samples/rocfft/CMakeLists.txt
--- rocfft-5.5.0/clients/samples/rocfft/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 # #############################################################################
-# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -52,8 +52,8 @@
   find_package( HIP REQUIRED )
 endif()
 
-if( NOT rocrand_FOUND )
-  find_package( rocrand REQUIRED )
+if( NOT hiprand_FOUND )
+  find_package( hiprand REQUIRED )
 endif()
 
 find_package( Boost COMPONENTS program_options REQUIRED)
@@ -82,7 +82,7 @@
 target_link_libraries(
   ${sample}
   PRIVATE roc::rocfft
-  roc::rocrand
+  hip::hiprand
   ${Boost_LIBRARIES}
   )
 
@@ -90,7 +90,7 @@
 
   set_target_properties( ${sample} PROPERTIES
     DEBUG_POSTFIX "-d"
-    CXX_STANDARD 11
+    CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
   )
 
diff -Nru rocfft-5.5.0/clients/samples/rocfft/examplekernels.h rocfft-5.7.1/clients/samples/rocfft/examplekernels.h
--- rocfft-5.5.0/clients/samples/rocfft/examplekernels.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/examplekernels.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,10 +18,11 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#ifndef __EXAMPLEKERNELS_H__
-#define __EXAMPLEKERNELS_H__
+#ifndef EXAMPLEKERNELS_H
+#define EXAMPLEKERNELS_H
 
-#include "../../data_gen.h"
+#include "../../../shared/data_gen.h"
+#include <hip/hip_complex.h>
 #include <hip/hip_runtime.h>
 #include <iostream>
 
@@ -69,42 +70,42 @@
 }
 
 // Kernel for initializing 1D complex data on the GPU.
-__global__ void initcdata1(std::complex<double>* x, const size_t Nx, const size_t xstride)
+__global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride)
 {
     const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if(idx < Nx)
     {
         const auto pos = idx * xstride;
-        x[pos].real(1 + idx);
-        x[pos].imag(1 + idx);
+        x[pos].x       = 1 + idx;
+        x[pos].y       = 1 + idx;
     }
 }
 
 // Kernel for initializing 2D complex input data on the GPU.
-__global__ void initcdata2(std::complex<double>* x,
-                           const size_t          Nx,
-                           const size_t          Ny,
-                           const size_t          xstride,
-                           const size_t          ystride)
+__global__ void initcdata2(hipDoubleComplex* x,
+                           const size_t      Nx,
+                           const size_t      Ny,
+                           const size_t      xstride,
+                           const size_t      ystride)
 {
     const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
     const auto idy = blockIdx.y * blockDim.y + threadIdx.y;
     if(idx < Nx && idy < Ny)
     {
         const auto pos = idx * xstride + idy * ystride;
-        x[pos].real(idx + 1);
-        x[pos].imag(idy + 1);
+        x[pos].x       = idx + 1;
+        x[pos].y       = idy + 1;
     }
 }
 
 // Kernel for initializing 3D complex input data on the GPU.
-__global__ void initcdata3(std::complex<double>* x,
-                           const size_t          Nx,
-                           const size_t          Ny,
-                           const size_t          Nz,
-                           const size_t          xstride,
-                           const size_t          ystride,
-                           const size_t          zstride)
+__global__ void initcdata3(hipDoubleComplex* x,
+                           const size_t      Nx,
+                           const size_t      Ny,
+                           const size_t      Nz,
+                           const size_t      xstride,
+                           const size_t      ystride,
+                           const size_t      zstride)
 {
     const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     const size_t idy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -112,8 +113,8 @@
     if(idx < Nx && idy < Ny && idz < Nz)
     {
         const auto pos = idx * xstride + idy * ystride + idz * zstride;
-        x[pos].real(idx + 10.0 * idz + 1);
-        x[pos].imag(idy + 10);
+        x[pos].x       = idx + 10.0 * idz + 1;
+        x[pos].y       = idy + 10;
     }
 }
 
@@ -141,7 +142,7 @@
                            griddim,
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length_cm[0],
                            stride_cm[0]);
         break;
@@ -155,7 +156,7 @@
                            griddim,
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length_cm[0],
                            length_cm[1],
                            stride_cm[0],
@@ -173,7 +174,7 @@
                            griddim,
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length_cm[0],
                            length_cm[1],
                            length_cm[2],
@@ -255,12 +256,12 @@
     {
     case 1:
     {
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1<double>,
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1<hipDoubleComplex>,
                            dim3(1),
                            dim3(1),
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length[0],
                            stride[0],
                            1,
@@ -270,12 +271,12 @@
     }
     case 2:
     {
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2<double>,
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2<hipDoubleComplex>,
                            dim3(256),
                            dim3(ceildiv(ceildiv(ilength[1], 2), 256)),
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length[0],
                            length[1],
                            stride[0],
@@ -288,12 +289,12 @@
     }
     case 3:
     {
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3<double>,
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3<hipDoubleComplex>,
                            dim3(64, 64),
                            dim3(ceildiv(ilength[1], 64), ceildiv(ceildiv(ilength[2], 2), 64)),
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length[0],
                            length[1],
                            length[2],
@@ -326,20 +327,14 @@
     {
         const dim3 blockdim(256);
         const dim3 griddim(ceildiv(ilength[0], blockdim.x));
-        hipLaunchKernelGGL(initcdata1,
-                           blockdim,
-                           griddim,
-                           0,
-                           0,
-                           (std::complex<double>*)gpu_in,
-                           ilength[0],
-                           stride[0]);
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1<double>,
+        hipLaunchKernelGGL( // launch geometry order is (grid dimensions, block dimensions)
+            initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]);
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1<hipDoubleComplex>,
                            dim3(1),
                            dim3(1),
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length[0],
                            stride[0],
                            1,
@@ -356,17 +351,17 @@
                            griddim,
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            ilength[0],
                            ilength[1],
                            stride[0],
                            stride[1]);
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2<double>,
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2<hipDoubleComplex>,
                            dim3(256),
                            dim3(ceildiv(ceildiv(ilength[1], 2), 256)),
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length[0],
                            length[1],
                            stride[0],
@@ -389,7 +384,7 @@
                            griddim,
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            ilength[0],
                            ilength[1],
                            ilength[2],
@@ -397,12 +392,12 @@
                            stride[1],
                            stride[2]);
 
-        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3<double>,
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3<hipDoubleComplex>,
                            dim3(64, 64),
                            dim3(ceildiv(ilength[1], 64), ceildiv(ceildiv(ilength[2], 2), 64)),
                            0,
                            0,
-                           (std::complex<double>*)gpu_in,
+                           (hipDoubleComplex*)gpu_in,
                            length[0],
                            length[1],
                            length[2],
@@ -423,4 +418,4 @@
     impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in);
 }
 
-#endif
+#endif /* EXAMPLEKERNELS_H */
diff -Nru rocfft-5.5.0/clients/samples/rocfft/exampleutils.h rocfft-5.7.1/clients/samples/rocfft/exampleutils.h
--- rocfft-5.5.0/clients/samples/rocfft/exampleutils.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/exampleutils.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -18,8 +18,14 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#ifndef __EXAMPLEUTILS_H__
-#define __EXAMPLEUTILS_H__
+#ifndef EXAMPLEUTILS_H
+#define EXAMPLEUTILS_H
+// Print a hipDoubleComplex as "(x,y)"; inline because this definition lives in a header.
+inline std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c)
+{
+    stream << "(" << c.x << "," << c.y << ")";
+    return stream;
+}
 
 // Increment the index (column-major) for looping over arbitrary dimensional loops with
 // dimensions length.
@@ -60,7 +66,9 @@
             const auto i = std::inner_product(index.begin(), index.end(), stride.begin(), b * dist);
             assert(i >= 0);
             assert(i < data.size());
+
             std::cout << data[i] << " ";
+
             for(size_t idx = 0; idx < index.size(); ++idx)
             {
                 if(index[idx] == (length[idx] - 1))
@@ -80,13 +88,13 @@
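-// Check that an multi-dimensional array of complex values with dimensions length
-// and straide stride, with nbatch copies separated by dist is Hermitian-symmetric.
+// Check that a multi-dimensional array of complex values with dimensions length
+// and stride stride, with nbatch copies separated by dist, is Hermitian-symmetric.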
 // Column-major version.
-template <class Tfloat, class Tint1, class Tint2>
-bool check_symmetry_cm(const std::vector<std::complex<Tfloat>>& data,
-                       const std::vector<Tint1>&                length_cm,
-                       const std::vector<Tint2>&                stride_cm,
-                       const size_t                             nbatch,
-                       const size_t                             dist,
-                       const bool                               verbose = true)
+template <class Tcomplex, class Tint1, class Tint2>
+bool check_symmetry_cm(const std::vector<Tcomplex>& data,
+                       const std::vector<Tint1>&    length_cm,
+                       const std::vector<Tint2>&    stride_cm,
+                       const size_t                 nbatch,
+                       const size_t                 dist,
+                       const bool                   verbose = true)
 {
     bool issymmetric = true;
     for(size_t b = 0; b < nbatch; b++)
@@ -118,7 +126,7 @@
                     = std::inner_product(index.begin(), index.end(), stride_cm.begin(), b * dist);
                 const auto j = std::inner_product(
                     negindex.begin(), negindex.end(), stride_cm.begin(), b * dist);
-                if(data[i] != std::conj(data[j]))
+                if((data[i].x != data[j].x) or (data[i].y != -data[j].y))
                 {
                     if(verbose)
                     {
@@ -152,4 +160,4 @@
     return issymmetric;
 }
 
-#endif
+#endif /* EXAMPLEUTILS_H */
diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_callback.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_callback.cpp
--- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_callback.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_callback.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -26,6 +26,7 @@
 #include <hip/hip_vector_types.h>
 #include <iostream>
 #include <math.h>
+#include <stdexcept>
 #include <vector>
 
 // example of using load/store callbacks with rocfft
@@ -63,77 +64,113 @@
     }
 
     // rocfft gpu compute
-    // ========================================
+    // ==================
 
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
 
     size_t Nbytes = N * sizeof(double2);
 
     // Create HIP device object.
     double2 *x, *filter_dev;
-    hipMalloc(&x, Nbytes);
-    hipMalloc(&filter_dev, Nbytes);
+
+    // create buffers
+    if(hipMalloc(&x, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
+
+    if(hipMalloc(&filter_dev, Nbytes) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
 
     //  Copy data to device
-    hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);
-    hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice);
+    hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
+
+    hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     // Create plan
-    rocfft_plan plan   = NULL;
+    rocfft_plan plan   = nullptr;
     size_t      length = N;
-    rocfft_plan_create(&plan,
-                       rocfft_placement_inplace,
-                       rocfft_transform_type_complex_forward,
-                       rocfft_precision_double,
-                       1,
-                       &length,
-                       1,
-                       NULL);
+    if(rocfft_plan_create(&plan,
+                          rocfft_placement_inplace,
+                          rocfft_transform_type_complex_forward,
+                          rocfft_precision_double,
+                          1,
+                          &length,
+                          1,
+                          nullptr)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_create failed.");
 
     // Check if the plan requires a work buffer
     size_t work_buf_size = 0;
-    rocfft_plan_get_work_buffer_size(plan, &work_buf_size);
+    if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
     void*                 work_buf = nullptr;
     rocfft_execution_info info     = nullptr;
-    rocfft_execution_info_create(&info);
+    if(rocfft_execution_info_create(&info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_create failed.");
     if(work_buf_size)
     {
-        hipMalloc(&work_buf, work_buf_size);
-        rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size);
+        if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
+
+        if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
+           != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
     }
 
-    // prepare callback
+    // Prepare callback
     load_cbdata cbdata_host;
     cbdata_host.filter = filter_dev;
     cbdata_host.scale  = 1.0 / static_cast<double>(N);
+
     void* cbdata_dev;
-    hipMalloc(&cbdata_dev, sizeof(load_cbdata));
-    hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice);
+    if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
+
+    hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
-    // get a properly-typed host pointer to the device function, as
+    // Get a properly-typed host pointer to the device function, as
     // rocfft_execution_info_set_load_callback expects void*.
     void* cbptr_host = nullptr;
-    hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*));
+    hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*));
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpyFromSymbol failed.");
 
     // set callback
-    rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0);
+    if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0)
+       != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_set_load_callback failed.");
 
     // Execute plan
-    rocfft_execute(plan, (void**)&x, NULL, info);
+    if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execute failed.");
 
     // Clean up work buffer
     if(work_buf_size)
     {
-        hipFree(work_buf);
-        rocfft_execution_info_destroy(info);
+        if(hipFree(work_buf) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
     }
+    if(rocfft_execution_info_destroy(info) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_destroy failed.");
+    info = nullptr; // destroyed unconditionally: info is created even without a work buffer
 
     // Destroy plan
-    rocfft_plan_destroy(plan);
+    if(rocfft_plan_destroy(plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    plan = nullptr;
 
     // Copy result back to host
     std::vector<double2> y(N);
-    hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
+    hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
 
     for(size_t i = 0; i < N; i++)
     {
@@ -141,11 +178,15 @@
                   << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
     }
 
-    hipFree(cbdata_dev);
-    hipFree(filter_dev);
-    hipFree(x);
+    if(hipFree(cbdata_dev) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+    if(hipFree(filter_dev) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+    if(hipFree(x) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
 
-    rocfft_cleanup();
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
 
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_complexcomplex.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_complexcomplex.cpp
--- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_complexcomplex.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_complexcomplex.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,7 @@
 
 #include "examplekernels.h"
 #include "exampleutils.h"
+#include <stdexcept>
 
 int main(int argc, char* argv[])
 {
@@ -63,7 +64,8 @@
     }
 
     // Placeness for the transform
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
     const rocfft_result_placement place
         = vm.count("outofplace") ? rocfft_placement_notinplace : rocfft_placement_inplace;
     const bool inplace = place == rocfft_placement_inplace;
@@ -117,26 +119,34 @@
     std::cout << std::endl;
 
     // Set the device:
-    hipSetDevice(deviceId);
+    if(hipSetDevice(deviceId) != hipSuccess)
+        throw std::runtime_error("hipSetDevice failed.");
 
-    // Create HIP device object and copy data to device
-    double2* gpu_in = NULL;
-    hipMalloc(&gpu_in, isize * sizeof(std::complex<double>));
+    // Create HIP device object and allocate data
+    hipDoubleComplex* gpu_in = nullptr;
+    if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess)
+        throw std::runtime_error("hipMalloc failed.");
 
     // Inititalize the data on the device
     initcomplex_cm(length, istride, gpu_in);
-    hipDeviceSynchronize();
+    if(hipDeviceSynchronize() != hipSuccess)
+        throw std::runtime_error("hipDeviceSynchronize failed.");
+
     hipError_t hip_status = hipGetLastError();
     if(hip_status != hipSuccess)
         throw std::runtime_error("device error");
 
     std::cout << "input:\n";
-    std::vector<std::complex<double>> idata(isize);
-    hipMemcpy(idata.data(), gpu_in, isize * sizeof(std::complex<double>), hipMemcpyDefault);
+    std::vector<hipDoubleComplex> idata(isize);
+    hip_status
+        = hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
+
     printbuffer_cm(idata, length, istride, 1, isize);
 
     // Create the a descrition struct to set data layout:
-    rocfft_plan_description gpu_description = NULL;
+    rocfft_plan_description gpu_description = nullptr;
     // rocfft_status can be used to capture API status info
     rocfft_status rc = rocfft_plan_description_create(&gpu_description);
     if(rc != rocfft_status_success)
@@ -144,8 +154,8 @@
     rc = rocfft_plan_description_set_data_layout(gpu_description,
                                                  rocfft_array_type_complex_interleaved,
                                                  rocfft_array_type_complex_interleaved,
-                                                 NULL,
-                                                 NULL,
+                                                 nullptr,
+                                                 nullptr,
                                                  istride.size(), // input stride length
                                                  istride.data(), // input stride data
                                                  0, // input batch distance
@@ -154,12 +164,12 @@
                                                  0); // ouptut batch distance
     if(rc != rocfft_status_success)
         throw std::runtime_error("failed to set data layout");
-    // We can also pass "NULL" instead of a description; rocFFT will use reasonable
+    // We can also pass "nullptr" instead of a description; rocFFT will use reasonable
     // default parameters.  If the data isn't contiguous, we need to set strides, etc,
     // using the description.
 
     // Create the plan
-    rocfft_plan gpu_plan = NULL;
+    rocfft_plan gpu_plan = nullptr;
     rc                   = rocfft_plan_create(&gpu_plan,
                             place,
                             direction,
@@ -172,7 +182,7 @@
         throw std::runtime_error("failed to create plan");
 
     // Get the execution info for the fft plan (in particular, work memory requirements):
-    rocfft_execution_info planinfo = NULL;
+    rocfft_execution_info planinfo = nullptr;
     rc                             = rocfft_execution_info_create(&planinfo);
     if(rc != rocfft_status_success)
         throw std::runtime_error("failed to create execution info");
@@ -182,24 +192,24 @@
         throw std::runtime_error("failed to get work buffer size");
 
     // If the transform requires work memory, allocate a work buffer:
-    void* wbuffer = NULL;
+    void* wbuffer = nullptr;
     if(workbuffersize > 0)
     {
         hip_status = hipMalloc(&wbuffer, workbuffersize);
         if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMalloc failed");
+            throw std::runtime_error("hipMalloc failed.");
         rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize);
         if(rc != rocfft_status_success)
-            throw std::runtime_error("failed to set work buffer");
+            throw std::runtime_error("failed to set work buffer.");
     }
 
     // If the transform is out-of-place, allocate the output buffer as well:
-    double2* gpu_out = inplace ? gpu_in : NULL;
+    hipDoubleComplex* gpu_out = inplace ? gpu_in : nullptr;
     if(!inplace)
     {
-        hip_status = hipMalloc(&gpu_out, osize * sizeof(std::complex<double>));
+        hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex));
         if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMalloc failed");
+            throw std::runtime_error("hipMalloc failed.");
     }
 
     // Execute the GPU transform:
@@ -208,30 +218,45 @@
                         (void**)&gpu_out, // out_buffer
                         planinfo); // execution info
     if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to execute");
+        throw std::runtime_error("failed to execute.");
 
     // Get the output from the device and print to cout:
     std::cout << "output:\n";
-    std::vector<std::complex<double>> odata(osize);
-    hipMemcpy(odata.data(), gpu_out, osize * sizeof(std::complex<double>), hipMemcpyDeviceToHost);
+    std::vector<hipDoubleComplex> odata(osize);
+    hip_status
+        = hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("hipMemcpy failed.");
+
     printbuffer_cm(odata, length, istride, 1, isize);
 
     // Clean up: free GPU memory:
-    hipFree(gpu_in);
+    if(hipFree(gpu_in) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+
     if(!inplace)
     {
-        hipFree(gpu_out);
+        if(hipFree(gpu_out) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
     }
-    if(wbuffer != NULL)
+    if(wbuffer != nullptr)
     {
-        hipFree(wbuffer);
+        if(hipFree(wbuffer) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
     }
 
     // Clean up: destroy plans:
-    rocfft_execution_info_destroy(planinfo);
-    rocfft_plan_description_destroy(gpu_description);
-    rocfft_plan_destroy(gpu_plan);
+    if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_destroy failed.");
+    planinfo = nullptr;
+    if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_description_destroy failed.");
+    gpu_description = nullptr;
+    if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    gpu_plan = nullptr;
 
-    rocfft_cleanup();
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_realcomplex.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_realcomplex.cpp
--- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_realcomplex.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_realcomplex.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,7 @@
 
 #include "examplekernels.h"
 #include "exampleutils.h"
+#include <stdexcept>
 
 int main(int argc, char* argv[])
 {
@@ -63,7 +64,8 @@
     }
 
     // Placeness for the transform
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
     const rocfft_result_placement place
         = vm.count("outofplace") ? rocfft_placement_notinplace : rocfft_placement_inplace;
     const bool inplace = place == rocfft_placement_inplace;
@@ -97,16 +99,16 @@
         cstride.push_back(clength[i - 1] * cstride[i - 1]);
     }
     const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1];
-    std::vector<std::complex<double>> cdata(complex_size); // host storage
+    std::vector<hipDoubleComplex> cdata(complex_size); // host storage
 
     // Based on the direction, we set the input and output parameters appropriately.
     const size_t isize  = forward ? real_size : complex_size;
-    const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(std::complex<double>));
+    const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex));
     const std::vector<size_t> ilength = forward ? length : clength;
     const std::vector<size_t> istride = forward ? rstride : cstride;
 
     const size_t osize  = forward ? complex_size : real_size;
-    const size_t obytes = osize * (forward ? sizeof(std::complex<double>) : sizeof(double));
+    const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double));
     const std::vector<size_t> olength = forward ? clength : length;
     const std::vector<size_t> ostride = forward ? cstride : rstride;
 
@@ -147,11 +149,12 @@
     std::cout << std::endl;
 
     // Set the device:
-    hipSetDevice(deviceId);
+    if(hipSetDevice(deviceId) != hipSuccess)
+        throw std::runtime_error("hipSetDevice failed.");
 
     // Create HIP device object and initialize data
     // Kernels are provided in examplekernels.h
-    void*      gpu_in     = NULL;
+    void*      gpu_in     = nullptr;
     hipError_t hip_status = hipMalloc(&gpu_in, inplace ? std::max(ibytes, obytes) : ibytes);
     if(hip_status != hipSuccess)
         throw std::runtime_error("device error");
@@ -169,12 +172,16 @@
     std::cout << "input:\n";
     if(forward)
     {
-        hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
+        hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
         printbuffer_cm(rdata, ilength, istride, 1, isize);
     }
     else
     {
-        hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
+        hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
         printbuffer_cm(cdata, ilength, istride, 1, isize);
 
         // Check that the buffer is Hermitian symmetric:
@@ -185,7 +192,7 @@
     rocfft_status rc = rocfft_status_success;
 
     // Create the a descrition struct to set data layout:
-    rocfft_plan_description gpu_description = NULL;
+    rocfft_plan_description gpu_description = nullptr;
     rc                                      = rocfft_plan_description_create(&gpu_description);
     if(rc != rocfft_status_success)
         throw std::runtime_error("failed to create plan description");
@@ -196,8 +203,8 @@
         forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
         // output data format:
         forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
-        NULL,
-        NULL,
+        nullptr,
+        nullptr,
         istride.size(), // input stride length
         istride.data(), // input stride data
         0, // input batch distance
@@ -207,12 +214,12 @@
     if(rc != rocfft_status_success)
         throw std::runtime_error("failed to set data layout");
 
-    // We can also pass "NULL" instead of a description; rocFFT will use reasonable
+    // We can also pass "nullptr" instead of a description; rocFFT will use reasonable
     // default parameters.  If the data isn't contiguous, we need to set strides, etc,
     // using the description.
 
     // Create the FFT plan:
-    rocfft_plan gpu_plan = NULL;
+    rocfft_plan gpu_plan = nullptr;
     rc                   = rocfft_plan_create(&gpu_plan,
                             place,
                             direction,
@@ -225,7 +232,7 @@
         throw std::runtime_error("failed to create plan");
 
     // Get the execution info for the fft plan (in particular, work memory requirements):
-    rocfft_execution_info planinfo = NULL;
+    rocfft_execution_info planinfo = nullptr;
     rc                             = rocfft_execution_info_create(&planinfo);
     if(rc != rocfft_status_success)
         throw std::runtime_error("failed to create execution info");
@@ -236,7 +243,7 @@
         throw std::runtime_error("failed to get work buffer size");
 
     // If the transform requires work memory, allocate a work buffer:
-    void* wbuffer = NULL;
+    void* wbuffer = nullptr;
     if(workbuffersize > 0)
     {
         hip_status = hipMalloc(&wbuffer, workbuffersize);
@@ -249,7 +256,7 @@
     }
 
     // If the transform is out-of-place, allocate the output buffer as well:
-    void* gpu_out = inplace ? gpu_in : NULL;
+    void* gpu_out = inplace ? gpu_in : nullptr;
     if(!inplace)
     {
         hip_status = hipMalloc(&gpu_out, obytes);
@@ -269,30 +276,44 @@
     std::cout << "output:\n";
     if(forward)
     {
-        hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
+        hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
         printbuffer_cm(cdata, olength, ostride, 1, osize);
     }
     else
     {
-        hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
+        hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
         printbuffer_cm(rdata, olength, ostride, 1, osize);
     }
 
     // Clean up: free GPU memory:
-    hipFree(gpu_in);
+    if(hipFree(gpu_in) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+
     if(!inplace)
     {
-        hipFree(gpu_out);
+        if(hipFree(gpu_out) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
     }
-    if(wbuffer != NULL)
+    if(wbuffer != nullptr)
     {
-        hipFree(wbuffer);
+        if(hipFree(wbuffer) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
     }
 
     // Clean up: destroy plans:
-    rocfft_execution_info_destroy(planinfo);
-    rocfft_plan_description_destroy(gpu_description);
-    rocfft_plan_destroy(gpu_plan);
+    if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_destroy failed.");
+    planinfo = nullptr;
+    if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_description_destroy failed.");
+    gpu_description = nullptr;
+    if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    gpu_plan = nullptr;
 
     rocfft_cleanup();
     return 0;
diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_set_stream.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_set_stream.cpp
--- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_set_stream.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_set_stream.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -22,103 +22,120 @@
 #include <cassert>
 #include <hip/hip_runtime_api.h>
 #include <iostream>
-
-#define CHECK_HIP_ERR(err)                                    \
-    if(err != hipSuccess)                                     \
-    {                                                         \
-        std::cerr << "hip error code : " << err << std::endl; \
-        exit(-1);                                             \
-    }
-
-#define CHECK_ROCFFT_ERR(err)                                    \
-    if(err != rocfft_status_success)                             \
-    {                                                            \
-        std::cerr << "rocFFT error code : " << err << std::endl; \
-        exit(-1);                                                \
-    }
+#include <stdexcept>
+#include <vector>
 
 struct fft_fixture_t
 {
-    double2*              cpu_buf;
-    double2*              gpu_buf;
-    hipStream_t           stream;
-    rocfft_execution_info info;
-    rocfft_plan           plan;
+    std::vector<double2>  cpu_buf;
+    double2*              gpu_buf = nullptr;
+    hipStream_t           stream  = nullptr;
+    rocfft_execution_info info    = nullptr;
+    rocfft_plan           plan    = nullptr;
 };
 
 int main(int argc, char* argv[])
 {
     std::cout << "rocfft example of 2 inplace transforms with 2 streams.\n" << std::endl;
 
-    size_t length      = 8;
-    size_t total_bytes = length * sizeof(double2);
+    size_t        length      = 8;
+    size_t        total_bytes = length * sizeof(double2);
+    hipError_t    hip_status;
+    rocfft_status fft_status;
 
     fft_fixture_t ffts[2];
 
     /// preparation
-    rocfft_setup();
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
     for(auto& it : ffts)
     {
         // create cpu buffer
-        it.cpu_buf = new double2[length];
+        it.cpu_buf.resize(length);
 
         // init cpu buffer...
 
         // create gpu buffer
-        CHECK_HIP_ERR(hipMalloc(&(it.gpu_buf), total_bytes));
+        if(hipMalloc(&(it.gpu_buf), total_bytes) != hipSuccess)
+            throw std::runtime_error("hipMalloc failed.");
 
         // copy host to device
-        CHECK_HIP_ERR(hipMemcpy(it.gpu_buf, it.cpu_buf, total_bytes, hipMemcpyHostToDevice));
+        if(hipMemcpy(it.gpu_buf, it.cpu_buf.data(), total_bytes, hipMemcpyHostToDevice)
+           != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
 
         // create stream
-        CHECK_HIP_ERR(hipStreamCreate(&(it.stream)));
+        if(hipStreamCreate(&(it.stream)) != hipSuccess)
+            throw std::runtime_error("hipStreamCreate failed.");
 
         // create execution info
-        CHECK_ROCFFT_ERR(rocfft_execution_info_create(&(it.info)));
+        fft_status = rocfft_execution_info_create(&(it.info));
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_create failed.");
 
         // set stream
         // NOTE: The stream must be of type hipStream_t.
         // It is an error to pass the address of a hipStream_t object.
-        CHECK_ROCFFT_ERR(rocfft_execution_info_set_stream(it.info, it.stream));
+        fft_status = rocfft_execution_info_set_stream(it.info, it.stream);
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_set_stream failed.");
 
         // create plan
-        CHECK_ROCFFT_ERR(rocfft_plan_create(&it.plan,
-                                            rocfft_placement_inplace,
-                                            rocfft_transform_type_complex_forward,
-                                            rocfft_precision_double,
-                                            1,
-                                            &length,
-                                            1,
-                                            nullptr));
+        fft_status = rocfft_plan_create(&it.plan,
+                                        rocfft_placement_inplace,
+                                        rocfft_transform_type_complex_forward,
+                                        rocfft_precision_double,
+                                        1,
+                                        &length,
+                                        1,
+                                        nullptr);
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_plan_create failed.");
+
         size_t work_buf_size = 0;
-        CHECK_ROCFFT_ERR(rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size));
+        fft_status           = rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size);
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
+
         assert(work_buf_size == 0); // simple 1D inplace fft doesn't need extra working buffer
     }
 
     /// execution
     for(auto& it : ffts)
     {
-        CHECK_ROCFFT_ERR(
-            rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr));
+        fft_status = rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr);
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_execute failed.");
     }
 
     /// wait and copy back
     for(auto& it : ffts)
     {
-        CHECK_HIP_ERR(hipStreamSynchronize(it.stream));
-        CHECK_HIP_ERR(hipMemcpy(it.cpu_buf, it.gpu_buf, total_bytes, hipMemcpyDeviceToHost));
+        if(hipStreamSynchronize(it.stream) != hipSuccess)
+            throw std::runtime_error("hipStreamSynchronize failed.");
+        hip_status = hipMemcpy(it.cpu_buf.data(), it.gpu_buf, total_bytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
     }
 
     /// clean up
     for(auto& it : ffts)
     {
-        CHECK_ROCFFT_ERR(rocfft_plan_destroy(it.plan));
-        CHECK_ROCFFT_ERR(rocfft_execution_info_destroy(it.info));
-        CHECK_HIP_ERR(hipStreamDestroy(it.stream));
-        CHECK_HIP_ERR(hipFree(it.gpu_buf));
-        delete[] it.cpu_buf;
+        fft_status = rocfft_plan_destroy(it.plan);
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_plan_destroy failed.");
+
+        fft_status = rocfft_execution_info_destroy(it.info);
+        if(fft_status != rocfft_status_success)
+            throw std::runtime_error("rocfft_execution_info_destroy failed.");
+
+        if(hipStreamDestroy(it.stream) != hipSuccess)
+            throw std::runtime_error("hipStreamDestroy failed.");
+        if(hipFree(it.gpu_buf) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
     }
 
-    rocfft_cleanup();
+    if(rocfft_cleanup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_cleanup failed.");
     return 0;
 }
diff -Nru rocfft-5.5.0/clients/tests/CMakeLists.txt rocfft-5.7.1/clients/tests/CMakeLists.txt
--- rocfft-5.5.0/clients/tests/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -60,8 +60,8 @@
   find_package( ROCM 0.7.3 REQUIRED )
 endif()
 
-if( NOT rocrand_FOUND )
-  find_package( rocrand REQUIRED )
+if( NOT hiprand_FOUND )
+  find_package( hiprand REQUIRED )
 endif()
 
 include( ROCMInstallTargets )
@@ -161,18 +161,15 @@
 
   # FFTW we build is always threaded
   set( FFTW_MULTITHREAD TRUE )
-endif()
 
-if( BUILD_FFTW OR NOT FFTW_FOUND )
   add_dependencies( rocfft-test fftw_double fftw_single )
+  rocm_install(
+    FILES ${FFTW_LIBRARIES}
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
+    COMPONENT clients-common
+  )
 endif()
 
-rocm_install(
-  FILES ${FFTW_LIBRARIES}
-  DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
-  COMPONENT clients-common
-)
-
 set( rocfft-test_include_dirs
   $<BUILD_INTERFACE:${Boost_INCLUDE_DIRS}>
   $<BUILD_INTERFACE:${FFTW_INCLUDES}>
@@ -216,7 +213,7 @@
   PRIVATE
   hip::device
   roc::rocfft
-  roc::rocrand
+  hip::hiprand
   ${rocfft-test_link_libs}
   )
 
@@ -234,13 +231,13 @@
 
 if( BUILD_CLIENTS_TESTS_OPENMP )
   if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )  
-    target_compile_options( rocfft-test PRIVATE -fopenmp -DBUILD_CLIENTS_TESTS_OPENMP )
+    target_compile_options( rocfft-test PRIVATE -fopenmp )
     target_link_libraries( rocfft-test PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib )
     target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include )
   else()
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-      set(OpenMP_CXX_FLAG "-fopenmp=libomp")
-      target_link_libraries(rocfft-test ${OpenMP_CXX_LIBRARIES})
+      target_compile_options( rocfft-test PRIVATE -fopenmp=libomp )
+      target_link_options( rocfft-test PRIVATE -fopenmp=libomp )
     endif()
   endif()
 endif()
@@ -289,6 +286,6 @@
     C:/Windows/System32/libomp140*.dll
   )
   foreach( file_i ${third_party_dlls})
-    add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${PROJECT_BINARY_DIR}/staging )
+    add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $<TARGET_FILE_DIR:rocfft-test> )
   endforeach( file_i )
 endif()
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test.cpp rocfft-5.7.1/clients/tests/accuracy_test.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -19,53 +19,44 @@
 // THE SOFTWARE.
 
 #include "accuracy_test.h"
+#include "../../shared/rocfft_complex.h"
 
-#include <hip/hip_complex.h>
 #include <hip/hip_runtime.h>
 
-__host__ __device__ float multiply_by_scalar(float a, double b)
-{
-    return a * b;
-}
-__host__ __device__ float2 multiply_by_scalar(float2 a, double b)
-{
-    return hipCmulf(a, make_float2(b, 0.0));
-}
-__host__ __device__ double multiply_by_scalar(double a, double b)
-{
-    return a * b;
-}
-__host__ __device__ double2 multiply_by_scalar(double2 a, double b)
+// load/store callbacks - cbdata in each is actually a callback_test_data
+// holding the expected base pointer and a scalar to apply to each element
+template <typename Tdata>
+__host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
 {
-    return hipCmul(a, make_double2(b, 0.0));
+    auto testdata = static_cast<const callback_test_data*>(cbdata);
+    // multiply each element by scalar
+    if(input == testdata->base)
+        return input[offset] * testdata->scalar;
+    // wrong base address passed, return something obviously wrong
+    else
+    {
+        // hand back element 0 rather than the scaled element at offset
+        return input[0];
+    }
 }
 
-__host__ __device__ float add_scalar(float a, double b)
-{
-    return a + b;
-}
-__host__ __device__ float2 add_scalar(float2 a, double b)
-{
-    return hipCaddf(a, make_float2(b, 0.0));
-}
-__host__ __device__ double add_scalar(double a, double b)
-{
-    return a + b;
-}
-__host__ __device__ double2 add_scalar(double2 a, double b)
-{
-    return hipCadd(a, make_double2(b, 0.0));
-}
+__device__ auto load_callback_dev_half           = load_callback<_Float16>;
+__device__ auto load_callback_dev_complex_half   = load_callback<rocfft_complex<_Float16>>;
+__device__ auto load_callback_dev_float          = load_callback<float>;
+__device__ auto load_callback_dev_complex_float  = load_callback<rocfft_complex<float>>;
+__device__ auto load_callback_dev_double         = load_callback<double>;
+__device__ auto load_callback_dev_complex_double = load_callback<rocfft_complex<double>>;
 
 // load/store callbacks - cbdata in each is actually a scalar double
 // with a number to apply to each element
 template <typename Tdata>
-__host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
+__host__ __device__ Tdata
+    load_callback_round_trip_inverse(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
 {
     auto testdata = static_cast<const callback_test_data*>(cbdata);
-    // multiply each element by scalar
+    // subtract scalar from each element
     if(input == testdata->base)
-        return multiply_by_scalar(input[offset], testdata->scalar);
+        return input[offset] - testdata->scalar;
     // wrong base address passed, return something obviously wrong
     else
     {
@@ -74,12 +65,22 @@
     }
 }
 
-__device__ auto load_callback_dev_float   = load_callback<float>;
-__device__ auto load_callback_dev_float2  = load_callback<float2>;
-__device__ auto load_callback_dev_double  = load_callback<double>;
-__device__ auto load_callback_dev_double2 = load_callback<double2>;
-
-void* get_load_callback_host(fft_array_type itype, fft_precision precision)
+__device__ auto load_callback_round_trip_inverse_dev_half
+    = load_callback_round_trip_inverse<_Float16>;
+__device__ auto load_callback_round_trip_inverse_dev_complex_half
+    = load_callback_round_trip_inverse<rocfft_complex<_Float16>>;
+__device__ auto load_callback_round_trip_inverse_dev_float
+    = load_callback_round_trip_inverse<float>;
+__device__ auto load_callback_round_trip_inverse_dev_complex_float
+    = load_callback_round_trip_inverse<rocfft_complex<float>>;
+__device__ auto load_callback_round_trip_inverse_dev_double
+    = load_callback_round_trip_inverse<double>;
+__device__ auto load_callback_round_trip_inverse_dev_complex_double
+    = load_callback_round_trip_inverse<rocfft_complex<double>>;
+
+void* get_load_callback_host(fft_array_type itype,
+                             fft_precision  precision,
+                             bool           round_trip_inverse = false)
 {
     void* load_callback_host = nullptr;
     switch(itype)
@@ -89,16 +90,56 @@
     {
         switch(precision)
         {
+        case fft_precision_half:
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(
+                              &load_callback_host,
+                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_half),
+                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_dev_complex_half),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
+            return load_callback_host;
         case fft_precision_single:
-            EXPECT_EQ(hipMemcpyFromSymbol(
-                          &load_callback_host, HIP_SYMBOL(load_callback_dev_float2), sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(
+                              &load_callback_host,
+                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_float),
+                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_dev_complex_float),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return load_callback_host;
         case fft_precision_double:
-            EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
-                                          HIP_SYMBOL(load_callback_dev_double2),
-                                          sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(
+                              &load_callback_host,
+                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_double),
+                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_dev_complex_double),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return load_callback_host;
         }
     }
@@ -106,15 +147,56 @@
     {
         switch(precision)
         {
+        case fft_precision_half:
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_half),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_dev_half),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
+            return load_callback_host;
         case fft_precision_single:
-            EXPECT_EQ(hipMemcpyFromSymbol(
-                          &load_callback_host, HIP_SYMBOL(load_callback_dev_float), sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(
+                    hipMemcpyFromSymbol(&load_callback_host,
+                                        HIP_SYMBOL(load_callback_round_trip_inverse_dev_float),
+                                        sizeof(void*)),
+                    hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_dev_float),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return load_callback_host;
         case fft_precision_double:
-            EXPECT_EQ(hipMemcpyFromSymbol(
-                          &load_callback_host, HIP_SYMBOL(load_callback_dev_double), sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(
+                    hipMemcpyFromSymbol(&load_callback_host,
+                                        HIP_SYMBOL(load_callback_round_trip_inverse_dev_double),
+                                        sizeof(void*)),
+                    hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
+                                              HIP_SYMBOL(load_callback_dev_double),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
+
             return load_callback_host;
         }
     }
@@ -132,16 +214,45 @@
     // add scalar to each element
     if(output == testdata->base)
     {
-        output[offset] = add_scalar(element, testdata->scalar);
+        output[offset] = element + testdata->scalar;
     }
     // otherwise, wrong base address passed, just don't write
 }
-__device__ auto store_callback_dev_float   = store_callback<float>;
-__device__ auto store_callback_dev_float2  = store_callback<float2>;
-__device__ auto store_callback_dev_double  = store_callback<double>;
-__device__ auto store_callback_dev_double2 = store_callback<double2>;
+__device__ auto store_callback_dev_half           = store_callback<_Float16>;
+__device__ auto store_callback_dev_complex_half   = store_callback<rocfft_complex<_Float16>>;
+__device__ auto store_callback_dev_float          = store_callback<float>;
+__device__ auto store_callback_dev_complex_float  = store_callback<rocfft_complex<float>>;
+__device__ auto store_callback_dev_double         = store_callback<double>;
+__device__ auto store_callback_dev_complex_double = store_callback<rocfft_complex<double>>;
 
-void* get_store_callback_host(fft_array_type otype, fft_precision precision)
+template <typename Tdata>
+__host__ __device__ static void store_callback_round_trip_inverse(
+    Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
+{
+    auto testdata = static_cast<callback_test_data*>(cbdata);
+    // divide each element by scalar
+    if(output == testdata->base)
+    {
+        output[offset] = element / testdata->scalar;
+    }
+    // otherwise, wrong base address passed, just don't write
+}
+__device__ auto store_callback_round_trip_inverse_dev_half
+    = store_callback_round_trip_inverse<_Float16>;
+__device__ auto store_callback_round_trip_inverse_dev_complex_half
+    = store_callback_round_trip_inverse<rocfft_complex<_Float16>>;
+__device__ auto store_callback_round_trip_inverse_dev_float
+    = store_callback_round_trip_inverse<float>;
+__device__ auto store_callback_round_trip_inverse_dev_complex_float
+    = store_callback_round_trip_inverse<rocfft_complex<float>>;
+__device__ auto store_callback_round_trip_inverse_dev_double
+    = store_callback_round_trip_inverse<double>;
+__device__ auto store_callback_round_trip_inverse_dev_complex_double
+    = store_callback_round_trip_inverse<rocfft_complex<double>>;
+
+void* get_store_callback_host(fft_array_type otype,
+                              fft_precision  precision,
+                              bool           round_trip_inverse = false)
 {
     void* store_callback_host = nullptr;
     switch(otype)
@@ -151,17 +262,56 @@
     {
         switch(precision)
         {
+        case fft_precision_half:
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(
+                              &store_callback_host,
+                              HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_half),
+                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
+                                              HIP_SYMBOL(store_callback_dev_complex_half),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
+            return store_callback_host;
         case fft_precision_single:
-            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
-                                          HIP_SYMBOL(store_callback_dev_float2),
-                                          sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(
+                              &store_callback_host,
+                              HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_float),
+                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
+                                              HIP_SYMBOL(store_callback_dev_complex_float),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return store_callback_host;
         case fft_precision_double:
-            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
-                                          HIP_SYMBOL(store_callback_dev_double2),
-                                          sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(
+                              &store_callback_host,
+                              HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_double),
+                              sizeof(void*)),
+                          hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
+                                              HIP_SYMBOL(store_callback_dev_complex_double),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return store_callback_host;
         }
     }
@@ -169,17 +319,56 @@
     {
         switch(precision)
         {
+        case fft_precision_half:
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(
+                    hipMemcpyFromSymbol(&store_callback_host,
+                                        HIP_SYMBOL(store_callback_round_trip_inverse_dev_half),
+                                        sizeof(void*)),
+                    hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
+                                              HIP_SYMBOL(store_callback_dev_half),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
+            return store_callback_host;
         case fft_precision_single:
-            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
-                                          HIP_SYMBOL(store_callback_dev_float),
-                                          sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(
+                    hipMemcpyFromSymbol(&store_callback_host,
+                                        HIP_SYMBOL(store_callback_round_trip_inverse_dev_float),
+                                        sizeof(void*)),
+                    hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
+                                              HIP_SYMBOL(store_callback_dev_float),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return store_callback_host;
         case fft_precision_double:
-            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
-                                          HIP_SYMBOL(store_callback_dev_double),
-                                          sizeof(void*)),
-                      hipSuccess);
+            if(round_trip_inverse)
+            {
+                EXPECT_EQ(
+                    hipMemcpyFromSymbol(&store_callback_host,
+                                        HIP_SYMBOL(store_callback_round_trip_inverse_dev_double),
+                                        sizeof(void*)),
+                    hipSuccess);
+            }
+            else
+            {
+                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
+                                              HIP_SYMBOL(store_callback_dev_double),
+                                              sizeof(void*)),
+                          hipSuccess);
+            }
             return store_callback_host;
         }
     }
@@ -190,7 +379,7 @@
 }
 
 // Apply store callback if necessary
-void apply_store_callback(const fft_params& params, fftw_data_t& output)
+void apply_store_callback(const fft_params& params, std::vector<hostbuf>& output)
 {
     if(!params.run_callbacks && params.scale_factor == 1.0)
         return;
@@ -206,12 +395,28 @@
     {
         switch(params.precision)
         {
+        case fft_precision_half:
+        {
+            const size_t elem_size = sizeof(rocfft_complex<_Float16>);
+            const size_t num_elems = output.front().size() / elem_size;
+
+            auto output_begin = reinterpret_cast<rocfft_complex<_Float16>*>(output.front().data());
+            for(size_t i = 0; i < num_elems; ++i)
+            {
+                auto& element = output_begin[i];
+                if(params.scale_factor != 1.0)
+                    element = element * params.scale_factor;
+                if(params.run_callbacks)
+                    store_callback(output_begin, i, element, &cbdata, nullptr);
+            }
+            break;
+        }
         case fft_precision_single:
         {
-            const size_t elem_size = sizeof(std::complex<float>);
+            const size_t elem_size = sizeof(rocfft_complex<float>);
             const size_t num_elems = output.front().size() / elem_size;
 
-            auto output_begin = reinterpret_cast<float2*>(output.front().data());
+            auto output_begin = reinterpret_cast<rocfft_complex<float>*>(output.front().data());
             for(size_t i = 0; i < num_elems; ++i)
             {
                 auto& element = output_begin[i];
@@ -224,10 +429,10 @@
         }
         case fft_precision_double:
         {
-            const size_t elem_size = sizeof(std::complex<double>);
+            const size_t elem_size = sizeof(rocfft_complex<double>);
             const size_t num_elems = output.front().size() / elem_size;
 
-            auto output_begin = reinterpret_cast<double2*>(output.front().data());
+            auto output_begin = reinterpret_cast<rocfft_complex<double>*>(output.front().data());
             for(size_t i = 0; i < num_elems; ++i)
             {
                 auto& element = output_begin[i];
@@ -247,14 +452,31 @@
         // planar wouldn't run callbacks, but we could still want scaling
         switch(params.precision)
         {
+        case fft_precision_half:
+        {
+            const size_t elem_size = sizeof(rocfft_complex<_Float16>);
+            for(auto& buf : output)
+            {
+                const size_t num_elems = buf.size() / elem_size;
+
+                auto output_begin = reinterpret_cast<rocfft_complex<_Float16>*>(buf.data());
+                for(size_t i = 0; i < num_elems; ++i)
+                {
+                    auto& element = output_begin[i];
+                    if(params.scale_factor != 1.0)
+                        element = element * params.scale_factor;
+                }
+            }
+            break;
+        }
         case fft_precision_single:
         {
-            const size_t elem_size = sizeof(std::complex<float>);
+            const size_t elem_size = sizeof(rocfft_complex<float>);
             for(auto& buf : output)
             {
                 const size_t num_elems = buf.size() / elem_size;
 
-                auto output_begin = reinterpret_cast<float2*>(buf.data());
+                auto output_begin = reinterpret_cast<rocfft_complex<float>*>(buf.data());
                 for(size_t i = 0; i < num_elems; ++i)
                 {
                     auto& element = output_begin[i];
@@ -266,12 +488,12 @@
         }
         case fft_precision_double:
         {
-            const size_t elem_size = sizeof(std::complex<double>);
+            const size_t elem_size = sizeof(rocfft_complex<double>);
             for(auto& buf : output)
             {
                 const size_t num_elems = buf.size() / elem_size;
 
-                auto output_begin = reinterpret_cast<double2*>(buf.data());
+                auto output_begin = reinterpret_cast<rocfft_complex<double>*>(buf.data());
                 for(size_t i = 0; i < num_elems; ++i)
                 {
                     auto& element = output_begin[i];
@@ -288,6 +510,22 @@
     {
         switch(params.precision)
         {
+        case fft_precision_half:
+        {
+            const size_t elem_size = sizeof(_Float16);
+            const size_t num_elems = output.front().size() / elem_size;
+
+            auto output_begin = reinterpret_cast<_Float16*>(output.front().data());
+            for(size_t i = 0; i < num_elems; ++i)
+            {
+                auto& element = output_begin[i];
+                if(params.scale_factor != 1.0)
+                    element = element * params.scale_factor;
+                if(params.run_callbacks)
+                    store_callback(output_begin, i, element, &cbdata, nullptr);
+            }
+            break;
+        }
         case fft_precision_single:
         {
             const size_t elem_size = sizeof(float);
@@ -330,7 +568,7 @@
 }
 
 // apply load callback if necessary
-void apply_load_callback(const fft_params& params, fftw_data_t& input)
+void apply_load_callback(const fft_params& params, std::vector<hostbuf>& input)
 {
     if(!params.run_callbacks)
         return;
@@ -348,12 +586,24 @@
     {
         switch(params.precision)
         {
+        case fft_precision_half:
+        {
+            const size_t elem_size = sizeof(rocfft_complex<_Float16>);
+            const size_t num_elems = input.front().size() / elem_size;
+
+            auto input_begin = reinterpret_cast<rocfft_complex<_Float16>*>(input.front().data());
+            for(size_t i = 0; i < num_elems; ++i)
+            {
+                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
+            }
+            break;
+        }
         case fft_precision_single:
         {
-            const size_t elem_size = sizeof(std::complex<float>);
+            const size_t elem_size = sizeof(rocfft_complex<float>);
             const size_t num_elems = input.front().size() / elem_size;
 
-            auto input_begin = reinterpret_cast<float2*>(input.front().data());
+            auto input_begin = reinterpret_cast<rocfft_complex<float>*>(input.front().data());
             for(size_t i = 0; i < num_elems; ++i)
             {
                 input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
@@ -362,10 +612,10 @@
         }
         case fft_precision_double:
         {
-            const size_t elem_size = sizeof(std::complex<double>);
+            const size_t elem_size = sizeof(rocfft_complex<double>);
             const size_t num_elems = input.front().size() / elem_size;
 
-            auto input_begin = reinterpret_cast<double2*>(input.front().data());
+            auto input_begin = reinterpret_cast<rocfft_complex<double>*>(input.front().data());
             for(size_t i = 0; i < num_elems; ++i)
             {
                 input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
@@ -379,6 +629,18 @@
     {
         switch(params.precision)
         {
+        case fft_precision_half:
+        {
+            const size_t elem_size = sizeof(_Float16);
+            const size_t num_elems = input.front().size() / elem_size;
+
+            auto input_begin = reinterpret_cast<_Float16*>(input.front().data());
+            for(size_t i = 0; i < num_elems; ++i)
+            {
+                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
+            }
+            break;
+        }
         case fft_precision_single:
         {
             const size_t elem_size = sizeof(float);
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test.h rocfft-5.7.1/clients/tests/accuracy_test.h
--- rocfft-5.5.0/clients/tests/accuracy_test.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -24,12 +24,15 @@
 #define ACCURACY_TEST
 
 #include <algorithm>
+#include <functional>
 #include <future>
 #include <iterator>
+#include <string>
 #include <vector>
 
+#include "../../shared/enum_to_string.h"
+#include "../../shared/fft_params.h"
 #include "../../shared/gpubuf.h"
-#include "../fft_params.h"
 #include "fftw_transform.h"
 #include "rocfft_against_fftw.h"
 #include "test_params.h"
@@ -39,24 +42,48 @@
 
 static const size_t ONE_GiB = 1 << 30;
 
-typedef std::vector<std::vector<char, fftwAllocator<char>>> fftw_data_t;
+inline size_t bytes_to_GiB(const size_t bytes)
+{
+    return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB;
+}
 
 typedef std::tuple<fft_transform_type, fft_result_placement, fft_array_type, fft_array_type>
     type_place_io_t;
 
-// Estimate the amount of host memory needed.
-inline size_t needed_ram(const fft_params& params, const int verbose)
+// Remember the results of the last FFT we computed with FFTW.  Tests
+// are ordered so that later cases can often reuse this result.
+struct last_cpu_fft_cache
 {
-    // We need at most 3 copies of the raw data: 2 are strictly
-    // required (input + output) but we keep a third copy around to
-    // save effort recomputing input for a smaller batch size or
-    // precision.
-    //
-    // This calculation is assuming contiguous data - noncontiguous
-    // temp buffers may be briefly required to mirror the data layout
-    // on the GPU, but they're assumed to require a close enough
-    // amount of space for the purposes of this estimate.
-    size_t needed_ram = 3
+    // keys to the cache
+    std::vector<size_t> length;
+    size_t              nbatch         = 0;
+    fft_transform_type  transform_type = fft_transform_type_complex_forward;
+    bool                run_callbacks  = false;
+    fft_precision       precision      = fft_precision_single;
+
+    // FFTW input/output
+    std::vector<hostbuf> cpu_input;
+    std::vector<hostbuf> cpu_output;
+};
+extern last_cpu_fft_cache last_cpu_fft_data;
+
+struct system_memory
+{
+    size_t total_bytes = 0;
+    size_t free_bytes  = 0;
+};
+extern system_memory start_memory;
+
+system_memory get_system_memory();
+
+// Estimate the amount of host memory needed for buffers.
+inline size_t needed_ram_buffers(const fft_params& params, const int verbose)
+{
+    // This calculation is assuming contiguous data but noncontiguous buffers
+    // are assumed to require a close enough amount of space for the purposes
+    // of this estimate.
+
+    size_t needed_ram = 6
                         * std::accumulate(params.length.begin(),
                                           params.length.end(),
                                           static_cast<size_t>(1),
@@ -70,6 +97,9 @@
     }
     switch(params.precision)
     {
+    case fft_precision_half:
+        needed_ram *= 2;
+        break;
     case fft_precision_single:
         needed_ram *= 4;
         break;
@@ -82,7 +112,87 @@
 
     if(verbose)
     {
-        std::cout << "required host memory (GiB): " << needed_ram / ONE_GiB << std::endl;
+        std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n";
+    }
+
+    return needed_ram;
+}
+
+template <typename Tfloat>
+bool fftw_plan_uses_bluestein(const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan)
+{
+#ifdef FFTW_HAVE_SPRINT_PLAN
+    char*       print_plan_c_str = fftw_sprint_plan<Tfloat>(cpu_plan);
+    std::string print_plan(print_plan_c_str);
+    free(print_plan_c_str);
+    return print_plan.find("bluestein") != std::string::npos;
+#else
+    // assume worst case (bluestein is always used)
+    return true;
+#endif
+}
+
+// Estimate the amount of host memory (in bytes) needed for fftw; zero unless the plan uses Bluestein.
+template <typename Tfloat>
+inline size_t needed_ram_fftw(const fft_params&                                  contiguous_params,
+                              const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
+                              const int                                          verbose)
+{
+    size_t total_length = std::accumulate(contiguous_params.length.begin(),
+                                          contiguous_params.length.end(),
+                                          static_cast<size_t>(1),
+                                          std::multiplies<size_t>());
+    size_t needed_ram   = 0;
+    // Detect Bluestein in plan
+    if(fftw_plan_uses_bluestein<Tfloat>(cpu_plan))
+    {
+        for(size_t dim : contiguous_params.length)
+        {
+            // size_t rather than unsigned int: the product computed below
+            // can exceed 32 bits and must not be truncated
+            size_t needed_ram_dim = dim;
+            // Next-plus-one-power-of-two multiplied by any other lengths
+            needed_ram_dim--;
+            // Smear the highest set bit into every lower bit.  The shifts
+            // must start at 1, otherwise the increment below does not
+            // land on a power of two (e.g. 5 would give 6 instead of 8).
+            for(size_t shift = 1; shift < sizeof(size_t) * 8; shift <<= 1)
+                needed_ram_dim |= needed_ram_dim >> shift;
+            needed_ram_dim++;
+
+            needed_ram_dim *= 2 * (total_length / dim);
+
+            if(needed_ram_dim > needed_ram)
+            {
+                needed_ram = needed_ram_dim;
+            }
+        }
+    }
+
+    // Account for precision and data type:
+    if(contiguous_params.transform_type != fft_transform_type_real_forward
+       && contiguous_params.transform_type != fft_transform_type_real_inverse)
+    {
+        needed_ram *= 2;
+    }
+    switch(contiguous_params.precision)
+    {
+    case fft_precision_half:
+        needed_ram *= 2;
+        break;
+    case fft_precision_single:
+        needed_ram *= 4;
+        break;
+    case fft_precision_double:
+        needed_ram *= 8;
+        break;
+    }
+
+    needed_ram *= contiguous_params.nbatch;
+
+    if(verbose)
+    {
+        std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n";
     }
 
     return needed_ram;
@@ -102,37 +212,21 @@
     }
 };
 
-// Remember the results of the last FFT we computed with FFTW.  Tests
-// are ordered so that later cases can often reuse this result.
-struct last_cpu_fft_cache
-{
-    // keys to the cache
-    std::vector<size_t> length;
-    size_t              nbatch         = 0;
-    fft_transform_type  transform_type = fft_transform_type_complex_forward;
-    bool                run_callbacks  = false;
-    fft_precision       precision      = fft_precision_single;
-
-    // FFTW input/output
-    fftw_data_t cpu_input;
-    fftw_data_t cpu_output;
-};
-extern last_cpu_fft_cache last_cpu_fft_data;
-
 const static std::vector<size_t> batch_range = {2, 1};
 
-const static std::vector<fft_precision> precision_range
+const static std::vector<fft_precision> precision_range_full
+    = {fft_precision_double, fft_precision_single, fft_precision_half};
+const static std::vector<fft_precision> precision_range_sp_dp
     = {fft_precision_double, fft_precision_single};
+
 const static std::vector<fft_result_placement> place_range
     = {fft_placement_inplace, fft_placement_notinplace};
-const static std::vector<fft_transform_type> trans_type_range = {fft_transform_type_complex_forward,
-                                                                 fft_transform_type_complex_inverse,
-                                                                 fft_transform_type_real_forward,
-                                                                 fft_transform_type_real_inverse};
+const static std::vector<fft_transform_type> trans_type_range
+    = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
 const static std::vector<fft_transform_type> trans_type_range_complex
-    = {fft_transform_type_complex_forward, fft_transform_type_complex_inverse};
+    = {fft_transform_type_complex_forward};
 const static std::vector<fft_transform_type> trans_type_range_real
-    = {fft_transform_type_real_forward, fft_transform_type_real_inverse};
+    = {fft_transform_type_real_forward};
 
 // Given a vector of vector of lengths, generate all unique permutations.
 // Add an optional vector of ad-hoc lengths to the result.
@@ -337,7 +431,6 @@
             // something to be passed to generate_lengths
             if(lengths.empty() || lengths.size() > 3)
             {
-                assert(false);
                 continue;
             }
             {
@@ -389,6 +482,47 @@
                                                 }
                                             }
                                             param.validate();
+
+                                            // Keeping the random number generator here
+                                            // allows one to run the same tests for a given
+                                            // random seed; ie the test suite is repeatable.
+                                            std::hash<std::string>           hasher;
+                                            std::ranlux24_base               gen(random_seed
+                                                                   + hasher(param.token()));
+                                            std::uniform_real_distribution<> dis(0.0, 1.0);
+
+                                            if(param.is_planar())
+                                            {
+                                                const double roll = dis(gen);
+                                                if(roll > planar_prob)
+                                                {
+                                                    if(verbose > 4)
+                                                    {
+                                                        std::cout << "Planar transform skipped "
+                                                                     "(planar_prob: "
+                                                                  << planar_prob << " < " << roll
+                                                                  << ")\n";
+                                                    }
+                                                    continue;
+                                                }
+                                            }
+                                            if(run_callbacks)
+                                            {
+                                                const double roll = dis(gen);
+                                                if(roll > callback_prob)
+                                                {
+                                                    // report the threshold that caused the skip
+                                                    if(verbose > 4)
+                                                    {
+                                                        std::cout << "Callback transform skipped "
+                                                                     "(callback_prob: "
+                                                                  << callback_prob << " < " << roll
+                                                                  << ")\n";
+                                                    }
+                                                    continue;
+                                                }
+                                            }
+
                                             if(param.valid(0))
                                             {
                                                 params.push_back(param);
@@ -471,7 +605,7 @@
                                  const bool                               planar,
                                  const bool                               run_callbacks = false)
 {
-    return param_generator_base(trans_type_range_complex,
+    return param_generator_base(trans_type_range_real,
                                 v_lengths,
                                 precision_range,
                                 batch_range,
@@ -506,40 +640,56 @@
     void* base;
 };
 
-void* get_load_callback_host(fft_array_type itype, fft_precision precision);
-void  apply_load_callback(const fft_params& params, fftw_data_t& input);
-void  apply_store_callback(const fft_params& params, fftw_data_t& output);
-void* get_store_callback_host(fft_array_type otype, fft_precision precision);
+void* get_load_callback_host(fft_array_type itype,
+                             fft_precision  precision,
+                             bool           round_trip_inverse);
+void  apply_load_callback(const fft_params& params, std::vector<hostbuf>& input);
+void  apply_store_callback(const fft_params& params, std::vector<hostbuf>& output);
+void* get_store_callback_host(fft_array_type otype,
+                              fft_precision  precision,
+                              bool           round_trip_inverse);
+
+static auto allocate_cpu_fft_buffer(const fft_precision        precision,
+                                    const fft_array_type       type,
+                                    const std::vector<size_t>& size)
+{
+    // FFTW has no half-precision transforms, so the CPU reference for a
+    // half-precision problem is computed in single precision; size the
+    // host buffer for single precision in that case.
+    const bool is_half = precision == fft_precision_half;
+    return allocate_host_buffer(is_half ? fft_precision_single : precision, type, size);
+}
 
 template <typename Tfloat>
 inline void execute_cpu_fft(fft_params&                                  params,
                             fft_params&                                  contiguous_params,
                             typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
-                            fftw_data_t&                                 cpu_input,
-                            fftw_data_t&                                 cpu_output)
+                            std::vector<hostbuf>&                        cpu_input,
+                            std::vector<hostbuf>&                        cpu_output)
 {
     // CPU output might not be allocated already for us, if FFTW never
     // needed an output buffer during planning
     if(cpu_output.empty())
-        cpu_output = allocate_host_buffer<fftwAllocator<char>>(
+        cpu_output = allocate_cpu_fft_buffer(
             contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
 
     // If this is either C2R or callbacks are enabled, the
     // input will be modified.  So we need to modify the copy instead.
-    fftw_data_t  cpu_input_copy;
-    fftw_data_t* input_ptr = &cpu_input;
+    std::vector<hostbuf>  cpu_input_copy(cpu_input.size());
+    std::vector<hostbuf>* input_ptr = &cpu_input;
     if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse)
     {
-        cpu_input_copy = cpu_input;
-        input_ptr      = &cpu_input_copy;
+        for(size_t i = 0; i < cpu_input.size(); ++i)
+        {
+            cpu_input_copy[i] = cpu_input[i].copy();
+        }
+
+        input_ptr = &cpu_input_copy;
     }
 
     // run FFTW (which may destroy CPU input)
     apply_load_callback(params, *input_ptr);
-    fftw_run<Tfloat>(contiguous_params.transform_type,
-                     cpu_plan,
-                     input_ptr->front().data(),
-                     cpu_output.front().data());
+    fftw_run<Tfloat>(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output);
     // clean up
     fftw_destroy_plan_type(cpu_plan);
     // ask FFTW to fully clean up, since it tries to cache plan details
@@ -550,40 +700,111 @@
 
 // execute the GPU transform
 template <class Tparams>
-inline void execute_gpu_fft(Tparams&            params,
-                            std::vector<void*>& pibuffer,
-                            std::vector<void*>& pobuffer,
-                            fftw_data_t&        gpu_output)
+inline void execute_gpu_fft(Tparams&              params,
+                            std::vector<void*>&   pibuffer,
+                            std::vector<void*>&   pobuffer,
+                            std::vector<hostbuf>& gpu_output,
+                            bool                  round_trip_inverse = false)
 {
     gpubuf_t<callback_test_data> load_cb_data_dev;
     gpubuf_t<callback_test_data> store_cb_data_dev;
     if(params.run_callbacks)
     {
-        void* load_cb_host = get_load_callback_host(params.itype, params.precision);
+        void* load_cb_host
+            = get_load_callback_host(params.itype, params.precision, round_trip_inverse);
 
         callback_test_data load_cb_data_host;
-        load_cb_data_host.scalar = params.load_cb_scalar;
-        load_cb_data_host.base   = pibuffer.front();
 
-        ASSERT_TRUE(hipSuccess == load_cb_data_dev.alloc(sizeof(callback_test_data)));
-        ASSERT_TRUE(hipSuccess
-                    == hipMemcpy(load_cb_data_dev.data(),
-                                 &load_cb_data_host,
-                                 sizeof(callback_test_data),
-                                 hipMemcpyHostToDevice));
+        if(round_trip_inverse)
+        {
+            load_cb_data_host.scalar = params.store_cb_scalar;
+        }
+        else
+        {
+            load_cb_data_host.scalar = params.load_cb_scalar;
+        }
+
+        load_cb_data_host.base = pibuffer.front();
+
+        auto hip_status = hipSuccess;
+
+        hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data));
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+        hip_status = hipMemcpy(load_cb_data_dev.data(),
+                               &load_cb_data_host,
+                               sizeof(callback_test_data),
+                               hipMemcpyHostToDevice);
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
 
-        void* store_cb_host = get_store_callback_host(params.otype, params.precision);
+        void* store_cb_host
+            = get_store_callback_host(params.otype, params.precision, round_trip_inverse);
 
         callback_test_data store_cb_data_host;
-        store_cb_data_host.scalar = params.store_cb_scalar;
-        store_cb_data_host.base   = pobuffer.front();
 
-        ASSERT_TRUE(hipSuccess == store_cb_data_dev.alloc(sizeof(callback_test_data)));
-        ASSERT_TRUE(hipSuccess
-                    == hipMemcpy(store_cb_data_dev.data(),
-                                 &store_cb_data_host,
-                                 sizeof(callback_test_data),
-                                 hipMemcpyHostToDevice));
+        if(round_trip_inverse)
+        {
+            store_cb_data_host.scalar = params.load_cb_scalar;
+        }
+        else
+        {
+            store_cb_data_host.scalar = params.store_cb_scalar;
+        }
+
+        store_cb_data_host.base = pobuffer.front();
+
+        hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data));
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+
+        hip_status = hipMemcpy(store_cb_data_dev.data(),
+                               &store_cb_data_host,
+                               sizeof(callback_test_data),
+                               hipMemcpyHostToDevice);
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP();
+            }
+            else
+            {
+                GTEST_FAIL();
+            }
+        }
+
         auto fft_status = params.set_callbacks(
             load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data());
         if(fft_status != fft_status_success)
@@ -596,18 +817,27 @@
         throw std::runtime_error("rocFFT plan execution failure");
 
     // copy GPU output back
-    ASSERT_TRUE(!params.osize.empty()) << "Error: params osize is empty";
-    gpu_output
-        = allocate_host_buffer<fftwAllocator<char>>(params.precision, params.otype, params.osize);
     ASSERT_TRUE(!gpu_output.empty()) << "no output buffers";
     for(unsigned int idx = 0; idx < gpu_output.size(); ++idx)
     {
-        ASSERT_TRUE(!gpu_output[idx].empty()) << "output buffer index " << idx << " is empty";
+        ASSERT_TRUE(gpu_output[idx].data() != nullptr)
+            << "output buffer index " << idx << " is empty";
         auto hip_status = hipMemcpy(gpu_output[idx].data(),
                                     pobuffer.at(idx),
                                     gpu_output[idx].size(),
                                     hipMemcpyDeviceToHost);
-        ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure";
+        if(hip_status != hipSuccess)
+        {
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP() << "hipMemcpy failure";
+            }
+            else
+            {
+                GTEST_FAIL() << "hipMemcpy failure";
+            }
+        }
     }
     if(verbose > 2)
     {
@@ -622,57 +852,69 @@
 }
 
 template <typename Tfloat>
-static void assert_init_value(const fftw_data_t& output, const size_t idx, const Tfloat orig_value);
+static void assert_init_value(const std::vector<hostbuf>& output,
+                              const size_t                idx,
+                              const Tfloat                orig_value);
 
 template <>
-void assert_init_value(const fftw_data_t& output, const size_t idx, const float orig_value)
+void assert_init_value(const std::vector<hostbuf>& output, const size_t idx, const float orig_value)
 {
     float actual_value = reinterpret_cast<const float*>(output.front().data())[idx];
     ASSERT_EQ(actual_value, orig_value) << "index " << idx;
 }
 
 template <>
-void assert_init_value(const fftw_data_t& output, const size_t idx, const double orig_value)
+void assert_init_value(const std::vector<hostbuf>& output,
+                       const size_t                idx,
+                       const double                orig_value)
 {
     double actual_value = reinterpret_cast<const double*>(output.front().data())[idx];
     ASSERT_EQ(actual_value, orig_value) << "index " << idx;
 }
 
 template <>
-void assert_init_value(const fftw_data_t& output, const size_t idx, const float2 orig_value)
+void assert_init_value(const std::vector<hostbuf>& output,
+                       const size_t                idx,
+                       const rocfft_complex<float> orig_value)
 {
     // if this is interleaved, check directly
     if(output.size() == 1)
     {
-        float2 actual_value = reinterpret_cast<const float2*>(output.front().data())[idx];
+        rocfft_complex<float> actual_value
+            = reinterpret_cast<const rocfft_complex<float>*>(output.front().data())[idx];
         ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
         ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
     }
     else
     {
         // planar
-        float2 actual_value{reinterpret_cast<const float*>(output.front().data())[idx],
-                            reinterpret_cast<const float*>(output.back().data())[idx]};
+        rocfft_complex<float> actual_value{
+            reinterpret_cast<const float*>(output.front().data())[idx],
+            reinterpret_cast<const float*>(output.back().data())[idx]};
         ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
         ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
     }
 }
 
 template <>
-void assert_init_value(const fftw_data_t& output, const size_t idx, const double2 orig_value)
+void assert_init_value(const std::vector<hostbuf>&  output,
+                       const size_t                 idx,
+                       const rocfft_complex<double> orig_value)
 {
     // if this is interleaved, check directly
     if(output.size() == 1)
     {
-        double2 actual_value = reinterpret_cast<const double2*>(output.front().data())[idx];
+        rocfft_complex<double> actual_value
+            = reinterpret_cast<const rocfft_complex<double>*>(output.front().data())[idx];
         ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
         ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
     }
     else
     {
         // planar
-        double2 actual_value{reinterpret_cast<const double*>(output.front().data())[idx],
-                             reinterpret_cast<const double*>(output.back().data())[idx]};
+        rocfft_complex<double> actual_value{
+            reinterpret_cast<const double*>(output.front().data())[idx],
+            reinterpret_cast<const double*>(output.back().data())[idx]};
         ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
         ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
     }
@@ -680,11 +922,11 @@
 
 static const int OUTPUT_INIT_PATTERN = 0xcd;
 template <class Tfloat>
-void check_single_output_stride(const fftw_data_t&         output,
-                                const size_t               offset,
-                                const std::vector<size_t>& length,
-                                const std::vector<size_t>& stride,
-                                const size_t               i)
+void check_single_output_stride(const std::vector<hostbuf>& output,
+                                const size_t                offset,
+                                const std::vector<size_t>&  length,
+                                const std::vector<size_t>&  stride,
+                                const size_t                i)
 {
     Tfloat orig;
     memset(static_cast<void*>(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat));
@@ -720,7 +962,7 @@
 }
 
 template <class Tparams>
-void check_output_strides(const fftw_data_t& output, Tparams& params)
+void check_output_strides(const std::vector<hostbuf>& output, Tparams& params)
 {
     // treat batch+dist like highest length+stride, if batch > 1
     std::vector<size_t> length;
@@ -740,46 +982,273 @@
         if(params.otype == fft_array_type_real)
             check_single_output_stride<float>(output, 0, length, stride, 0);
         else
-            check_single_output_stride<float2>(output, 0, length, stride, 0);
+            check_single_output_stride<rocfft_complex<float>>(output, 0, length, stride, 0);
     }
     else
     {
         if(params.otype == fft_array_type_real)
             check_single_output_stride<double>(output, 0, length, stride, 0);
         else
-            check_single_output_stride<double2>(output, 0, length, stride, 0);
+            check_single_output_stride<rocfft_complex<double>>(output, 0, length, stride, 0);
     }
 }
 
-// run CPU + rocFFT transform with the given params and compare
-template <class Tfloat, class Tparams>
-inline void fft_vs_reference_impl(Tparams& params)
+// plan and execute the round-trip inverse transform described by params; output goes to gpu_output
+template <class Tparams>
+inline void run_round_trip_inverse(Tparams&              params,
+                                   std::vector<gpubuf>&  obuffer,
+                                   std::vector<void*>&   pibuffer,
+                                   std::vector<void*>&   pobuffer,
+                                   std::vector<hostbuf>& gpu_output)
 {
+    params.validate();
+
     // Make sure that the parameters make sense:
     ASSERT_TRUE(params.valid(verbose));
 
-    if(ramgb > 0 && needed_ram(params, verbose) > ramgb * ONE_GiB)
+    // Create FFT plan - this will also allocate work buffer, but will throw a
+    // specific exception if that step fails
+    auto plan_status = fft_status_success;
+    try
     {
-        if(verbose)
+        plan_status = params.create_plan();
+    }
+    catch(fft_params::work_buffer_alloc_failure& e)
+    {
+        std::stringstream ss;
+        ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")";
+        ++n_hip_failures;
+        if(skip_runtime_fails)
         {
-            std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." << std::endl;
+            GTEST_SKIP() << ss.str();
+        }
+        else
+        {
+            GTEST_FAIL() << ss.str();
         }
-        GTEST_SKIP();
-        return;
+    }
+    ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed";
+
+    auto obuffer_sizes = params.obuffer_sizes();
+
+    if(params.placement != fft_placement_inplace)
+    {
+        for(unsigned int i = 0; i < obuffer_sizes.size(); ++i)
+        {
+            // If we're validating output strides, init the
+            // output buffer to a known pattern and we can check
+            // that the pattern is untouched in places that
+            // shouldn't have been touched.
+            if(params.check_output_strides)
+            {
+                auto hip_status
+                    = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
+                if(hip_status != hipSuccess)
+                {
+                    ++n_hip_failures;
+                    if(skip_runtime_fails)
+                    {
+                        GTEST_SKIP() << "hipMemset failure";
+                    }
+                    else
+                    {
+                        GTEST_FAIL() << "hipMemset failure";
+                    }
+                }
+            }
+        }
+    }
+
+    // execute GPU transform; the trailing `true` is round_trip_inverse.
+    // With callbacks enabled, execute_gpu_fft then feeds store_cb_scalar to
+    // the load callback and load_cb_scalar to the store callback.
+
+    execute_gpu_fft(params, pibuffer, pobuffer, gpu_output, true);
+}
+
+// compare the rocFFT round-trip inverse output (gpu_output) with the forward input (cpu_input)
+template <class Tparams>
+inline void compare_round_trip_inverse(Tparams&              params,
+                                       fft_params&           contiguous_params,
+                                       std::vector<hostbuf>& gpu_output,
+                                       std::vector<hostbuf>& cpu_input,
+                                       const VectorNorms&    cpu_input_norm,
+                                       size_t                total_length)
+{
+    if(params.check_output_strides)
+    {
+        check_output_strides<Tparams>(gpu_output, params);
+    }
+
+    // compute GPU output norm on a separate thread while distance() runs below
+    std::shared_future<VectorNorms> gpu_norm = std::async(std::launch::async, [&]() {
+        return norm(gpu_output,
+                    params.olength(),
+                    params.nbatch,
+                    params.precision,
+                    params.otype,
+                    params.ostride,
+                    params.odist,
+                    params.ooffset);
+    });
+
+    // compare GPU inverse output to CPU forward input (failure list allocated only if verbose > 1)
+    std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
+    if(verbose > 1)
+        linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
+    const double linf_cutoff
+        = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length);
+
+    VectorNorms diff = distance(cpu_input,
+                                gpu_output,
+                                params.olength(),
+                                params.nbatch,
+                                params.precision,
+                                contiguous_params.itype,
+                                contiguous_params.istride,
+                                contiguous_params.idist,
+                                params.otype,
+                                params.ostride,
+                                params.odist,
+                                linf_failures.get(),
+                                linf_cutoff,
+                                {0},
+                                params.ooffset,
+                                1.0 / total_length); // NOTE(review): presumably an output scale - confirm
+
+    if(verbose > 1)
+    {
+        std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
+        std::cout << "GPU output L2 norm:   " << gpu_norm.get().l_2 << "\n";
+        std::cout << "GPU linf norm failures:";
+        std::sort(linf_failures->begin(), linf_failures->end());
+        for(const auto& i : *linf_failures)
+        {
+            std::cout << " (" << i.first << "," << i.second << ")";
+        }
+        std::cout << std::endl;
+    }
+    // both GPU output norms must be finite
+    EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
+    EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
+    // keep a running maximum of the observed error ratios for this precision
+    switch(params.precision)
+    {
+    case fft_precision_half:
+        max_linf_eps_half
+            = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+        max_l2_eps_half
+            = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+        break;
+    case fft_precision_single:
+        max_linf_eps_single
+            = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+        max_l2_eps_single
+            = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+        break;
+    case fft_precision_double:
+        max_linf_eps_double
+            = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
+        max_l2_eps_double
+            = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
+        break;
+    }
+
+    if(verbose > 1)
+    {
+        std::cout << "L2 diff: " << diff.l_2 << "\n";
+        std::cout << "Linf diff: " << diff.l_inf << "\n";
+    }
+    // pass/fail: Linf against linf_cutoff, relative L2 against sqrt(log2(N)) * epsilon
+    EXPECT_TRUE(diff.l_inf <= linf_cutoff)
+        << "Linf test failed.  Linf:" << diff.l_inf
+        << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff
+        << params.str();
+
+    EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2
+                < sqrt(log2(total_length)) * type_epsilon(params.precision))
+        << "L2 test failed. L2: " << diff.l_2
+        << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2
+        << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
+        << params.str();
+}
+
+// Scope guard: on destruction, swaps the referenced buffers into last_cpu_fft_data
+struct StoreCPUDataToCache
+{
+    std::vector<hostbuf>& cpu_input;
+    std::vector<hostbuf>& cpu_output;
+    StoreCPUDataToCache(std::vector<hostbuf>& input, std::vector<hostbuf>& output)
+        : cpu_input(input)
+        , cpu_output(output)
+    {
+    }
+    ~StoreCPUDataToCache()
+    {
+        cpu_output.swap(last_cpu_fft_data.cpu_output);
+        cpu_input.swap(last_cpu_fft_data.cpu_input);
+    }
+};
+
+// run CPU + rocFFT transform with the given params and compare
+template <class Tfloat, class Tparams>
+inline void fft_vs_reference_impl(Tparams& params, bool round_trip)
+{
+    // Make sure that the parameters make sense:
+    ASSERT_TRUE(params.valid(verbose));
+
+    size_t needed_ram = needed_ram_buffers(params, verbose);
+
+    if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
+    {
+        GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb
+                     << ".\n";
     }
 
     auto ibuffer_sizes = params.ibuffer_sizes();
     auto obuffer_sizes = params.obuffer_sizes();
 
+    size_t vram_avail = 0;
+
+    if(vramgb == 0)
+    {
+        // No explicit limit (vramgb == 0): query the device; only `total` is used below.
+        size_t free       = 0;
+        size_t total      = 0;
+        auto   hip_status = hipMemGetInfo(&free, &total);
+        if(hip_status != hipSuccess || total == 0)
+        {
+            ++n_hip_failures;
+            std::stringstream ss;
+            if(hip_status != hipSuccess) // test the API error first: total may still be 0 here
+                ss << "hipMemGetInfo failure with error " << hip_status;
+            else
+                ss << "hipMemGetInfo claims there isn't any vram";
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP() << ss.str();
+            }
+            else
+            {
+                GTEST_FAIL() << ss.str();
+            }
+        }
+        vram_avail = total;
+    }
+    else
+    {
+        vram_avail = vramgb * ONE_GiB;
+    }
+
     // First try a quick estimation of vram footprint, to speed up skipping tests
     // that are too large to fit in the gpu (no plan created with the rocFFT backend)
     const auto raw_vram_footprint
         = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
 
-    if(!vram_fits_problem(raw_vram_footprint))
+    if(!vram_fits_problem(raw_vram_footprint, vram_avail))
     {
-        GTEST_SKIP() << "Raw problem size (" << raw_vram_footprint
-                     << ") raw data too large for device";
+        GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint)
+                     << " GiB) raw data too large for device";
     }
 
     if(verbose > 2)
@@ -791,41 +1260,40 @@
     // accurate calculation that actually creates the plan and
     // take into account the work buffer size
     const auto vram_footprint = params.vram_footprint();
-    if(!vram_fits_problem(vram_footprint))
+    if(!vram_fits_problem(vram_footprint, vram_avail))
     {
         if(verbose)
         {
             std::cout << "Problem raw data won't fit on device; skipped." << std::endl;
         }
-        GTEST_SKIP() << "Problem size (" << vram_footprint << ") raw data too large for device";
+        GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint)
+                     << " GiB) raw data too large for device";
     }
 
     // Create FFT plan - this will also allocate work buffer, but
     // will throw a specific exception if that step fails
+    auto plan_status = fft_status_success;
     try
     {
-        ASSERT_EQ(params.create_plan(), fft_status_success);
+        plan_status = params.create_plan();
     }
     catch(fft_params::work_buffer_alloc_failure& e)
     {
-        GTEST_SKIP() << "Problem size with work buffer (" << vram_footprint + params.workbuffersize
-                     << ") too large for device";
+        ++n_hip_failures;
+        std::stringstream ss;
+        ss << "Work buffer allocation failed with size: " << params.workbuffersize;
+        if(skip_runtime_fails)
+        {
+            GTEST_SKIP() << ss.str();
+        }
+        else
+        {
+            GTEST_FAIL() << ss.str();
+        }
     }
+    ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed";
 
-    // Recheck whether the raw data fits on the device, now that the
-    // work buffer has been allocated (if required).
-    if(verbose > 1)
-    {
-        size_t     free   = 0;
-        size_t     total  = 0;
-        hipError_t retval = hipMemGetInfo(&free, &total);
-        ASSERT_EQ(retval, hipSuccess) << "hipMemGetInfo failed with error " << retval;
-        std::cout << "data footprint: " << vram_footprint << " (" << (double)vram_footprint
-                  << ") workbuffer: " << params.workbuffersize << " ("
-                  << (double)params.workbuffersize << ") free: " << free << " (" << (double)free
-                  << ") total: " << total << " (" << (double)total << ")\n";
-    }
-    if(!vram_fits_problem(vram_footprint))
+    if(!vram_fits_problem(vram_footprint, vram_avail))
     {
         if(verbose)
         {
@@ -853,30 +1321,31 @@
 
     if(verbose > 3)
     {
-        std::cout << "CPU  params:\n";
+        std::cout << "CPU params:\n";
         std::cout << contiguous_params.str("\n\t") << std::endl;
     }
 
-    // helper function to convert double input/output to float
-    // in-place so we don't need extra memory
-    auto convert_to_single = [](fftw_data_t& data) {
-        for(auto& arr : data)
-        {
-            const double* readPtr  = reinterpret_cast<const double*>(arr.data());
-            const double* readEnd  = readPtr + (arr.size() / sizeof(double));
-            float*        writePtr = reinterpret_cast<float*>(arr.data());
-            std::copy(readPtr, readEnd, writePtr);
-            arr.resize(arr.size() / 2);
-        }
-    };
-
     std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
     std::vector<void*>  pibuffer(ibuffer_sizes.size());
     for(unsigned int i = 0; i < ibuffer.size(); ++i)
     {
         auto hip_status = ibuffer[i].alloc(ibuffer_sizes[i]);
-        ASSERT_EQ(hip_status, hipSuccess) << "hipMalloc failure for input buffer " << i << " size "
-                                          << ibuffer_sizes[i] << " " << params.str();
+        if(hip_status != hipSuccess)
+        {
+            std::stringstream ss;
+            ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "("
+               << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)"
+               << " with code " << hipError_to_string(hip_status);
+            ++n_hip_failures;
+            if(skip_runtime_fails)
+            {
+                GTEST_SKIP() << ss.str();
+            }
+            else
+            {
+                GTEST_FAIL() << ss.str();
+            }
+        }
         pibuffer[i] = ibuffer[i].data();
     }
 
@@ -888,11 +1357,12 @@
     // Check cache first - nbatch is a >= comparison because we compute
     // the largest batch size and cache it.  Smaller batch runs can
     // compare against the larger data.
-    fftw_data_t              cpu_input;
-    fftw_data_t              cpu_output;
-    std::shared_future<void> convert_cpu_output_precision;
-    std::shared_future<void> convert_cpu_input_precision;
-    bool                     run_fftw = true;
+    std::vector<hostbuf>                 cpu_input;
+    std::vector<hostbuf>                 cpu_output;
+    std::shared_future<void>             convert_cpu_output_precision;
+    std::shared_future<void>             convert_cpu_input_precision;
+    bool                                 run_fftw = true;
+    std::unique_ptr<StoreCPUDataToCache> store_to_cache;
     if(last_cpu_fft_data.length == params.length
        && last_cpu_fft_data.transform_type == params.transform_type
        && last_cpu_fft_data.run_callbacks == params.run_callbacks)
@@ -904,26 +1374,64 @@
             cpu_output.swap(last_cpu_fft_data.cpu_output);
             run_fftw = false;
 
+            store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
+
             if(params.precision != last_cpu_fft_data.precision)
             {
-                // Tests should be ordered so we do double first, then float.
-                if(last_cpu_fft_data.precision == fft_precision_double)
+                // Tests should be ordered so we do wider first, then narrower.
+                switch(params.precision)
                 {
-                    // convert the input/output to single-precision
-                    convert_cpu_output_precision
-                        = std::async(std::launch::async, [&]() { convert_to_single(cpu_output); });
-                    convert_cpu_input_precision
-                        = std::async(std::launch::async, [&]() { convert_to_single(cpu_input); });
-                    last_cpu_fft_data.precision = fft_precision_single;
-                }
-                else
-                {
-                    // Somehow we've done float first, then double?
-                    // Tests are ordered wrong, and we don't want to
-                    // lose precision
-                    std::cerr << "Can't do float first then double: aborting." << std::endl;
+                case fft_precision_double:
+                    std::cerr
+                        << "test ordering is incorrect: double precision follows a narrower one"
+                        << std::endl;
                     abort();
+                    break;
+                case fft_precision_single:
+                    if(last_cpu_fft_data.precision != fft_precision_double)
+                    {
+                        std::cerr
+                            << "test ordering is incorrect: float precision follows a narrower one"
+                            << std::endl;
+                        abort();
+                    }
+                    // convert the input/output to single-precision
+                    convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+                        narrow_precision_inplace<double, float>(cpu_output.front());
+                    });
+                    convert_cpu_input_precision  = std::async(std::launch::async, [&]() {
+                        narrow_precision_inplace<double, float>(cpu_input.front());
+                    });
+                    break;
+                case fft_precision_half:
+                    // convert to half precision
+                    if(last_cpu_fft_data.precision == fft_precision_double)
+                    {
+                        convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+                            narrow_precision_inplace<double, _Float16>(cpu_output.front());
+                        });
+                        convert_cpu_input_precision  = std::async(std::launch::async, [&]() {
+                            narrow_precision_inplace<double, _Float16>(cpu_input.front());
+                        });
+                    }
+                    else if(last_cpu_fft_data.precision == fft_precision_single)
+                    {
+                        convert_cpu_output_precision = std::async(std::launch::async, [&]() {
+                            narrow_precision_inplace<float, _Float16>(cpu_output.front());
+                        });
+                        convert_cpu_input_precision  = std::async(std::launch::async, [&]() {
+                            narrow_precision_inplace<float, _Float16>(cpu_input.front());
+                        });
+                    }
+                    else
+                    {
+                        std::cerr << "unhandled previous precision, cannot convert to half"
+                                  << std::endl;
+                        abort();
+                    }
+                    break;
                 }
+                last_cpu_fft_data.precision = params.precision;
             }
         }
         // If the last result has a smaller batch than the new
@@ -933,14 +1441,17 @@
         // might never have tried to generate the bigger batch first.
         // So just fall through and redo the CPU FFT.
     }
-    // Clear cache explicitly so that even if we didn't get a hit,
-    // we're not uselessly holding on to cached cpu input/output
-    last_cpu_fft_data = last_cpu_fft_cache();
+    else
+    {
+        // Clear cache explicitly so that even if we didn't get a hit,
+        // we're not uselessly holding on to cached cpu input/output
+        last_cpu_fft_data = last_cpu_fft_cache();
+    }
 
     // Allocate CPU input
     if(run_fftw)
     {
-        cpu_input = allocate_host_buffer<fftwAllocator<char>>(
+        cpu_input = allocate_cpu_fft_buffer(
             contiguous_params.precision, contiguous_params.itype, contiguous_params.isize);
     }
 
@@ -955,7 +1466,7 @@
         // creation time.
         if(use_fftw_wisdom)
         {
-            cpu_output = allocate_host_buffer<fftwAllocator<char>>(
+            cpu_output = allocate_cpu_fft_buffer(
                 contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
         }
         cpu_plan = fftw_plan_via_rocfft<Tfloat>(contiguous_params.length,
@@ -967,32 +1478,56 @@
                                                 contiguous_params.transform_type,
                                                 cpu_input,
                                                 cpu_output);
+
+        needed_ram += needed_ram_fftw<Tfloat>(contiguous_params, cpu_plan, verbose);
+
+        if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
+        {
+            if(verbose)
+            {
+                std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]."
+                          << std::endl;
+            }
+            GTEST_SKIP();
+            return;
+        }
     }
 
+    std::vector<hostbuf> gpu_input_data
+        = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
+
     // allocate and populate the input buffer (cpu/gpu)
     if(run_fftw)
     {
         //generate the input directly on the gpu
-        compute_input(params, ibuffer);
+        params.compute_input(ibuffer);
 
         // Copy the input to CPU
         if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
            || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
         {
-            auto tmp_cpu_input = allocate_host_buffer<fftwAllocator<char>>(
-                params.precision, params.itype, ibuffer_sizes_elems);
-
             // Copy input to CPU
             for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
             {
-                auto hip_status = hipMemcpy(tmp_cpu_input.at(idx).data(),
+                auto hip_status = hipMemcpy(gpu_input_data.at(idx).data(),
                                             ibuffer[idx].data(),
                                             ibuffer_sizes[idx],
                                             hipMemcpyDeviceToHost);
-                ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure with error " << hip_status;
+                if(hip_status != hipSuccess)
+                {
+                    ++n_hip_failures;
+                    if(skip_runtime_fails)
+                    {
+                        GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
+                    }
+                    else
+                    {
+                        GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
+                    }
+                }
             }
 
-            copy_buffers(tmp_cpu_input,
+            copy_buffers(gpu_input_data,
                          cpu_input,
                          params.ilength(),
                          params.nbatch,
@@ -1015,7 +1550,18 @@
                                             ibuffer[idx].data(),
                                             ibuffer_sizes[idx],
                                             hipMemcpyDeviceToHost);
-                ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure with error " << hip_status;
+                if(hip_status != hipSuccess)
+                {
+                    ++n_hip_failures;
+                    if(skip_runtime_fails)
+                    {
+                        GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
+                    }
+                    else
+                    {
+                        GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
+                    }
+                }
             }
         }
     }
@@ -1026,17 +1572,13 @@
             convert_cpu_input_precision.get();
 
         // gets a pre-computed gpu input buffer from the cpu cache
-        fftw_data_t  temp_gpu_input;
-        fftw_data_t* gpu_input = &cpu_input;
+        std::vector<hostbuf>* gpu_input = &cpu_input;
 
         if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
            || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
         {
-            temp_gpu_input = allocate_host_buffer<fftwAllocator<char>>(
-                params.precision, params.itype, ibuffer_sizes_elems);
-
             copy_buffers(cpu_input,
-                         temp_gpu_input,
+                         gpu_input_data,
                          params.ilength(),
                          params.nbatch,
                          params.precision,
@@ -1048,7 +1590,7 @@
                          params.idist,
                          {0},
                          params.ioffset);
-            gpu_input = &temp_gpu_input;
+            gpu_input = &gpu_input_data;
         }
 
         // Copy input to GPU
@@ -1058,7 +1600,19 @@
                                         gpu_input->at(idx).data(),
                                         ibuffer_sizes[idx],
                                         hipMemcpyHostToDevice);
-            ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure with error " << hip_status;
+
+            if(hip_status != hipSuccess)
+            {
+                ++n_hip_failures;
+                if(skip_runtime_fails)
+                {
+                    GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
+                }
+                else
+                {
+                    GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
+                }
+            }
         }
     }
 
@@ -1109,25 +1663,20 @@
             auto hip_status = obuffer_data[i].alloc(obuffer_sizes[i]);
             if(hip_status != hipSuccess)
             {
-                // Try and figure out why hip malloc failed.
-                size_t     free   = 0;
-                size_t     total  = 0;
-                hipError_t retval = hipMemGetInfo(&free, &total);
-                EXPECT_EQ(retval, hipSuccess) << "hipMemGetInfo failed with error " << retval;
-                if(retval == hipSuccess)
+                ++n_hip_failures;
+                std::stringstream ss;
+                ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i]
+                   << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)"
+                   << " with code " << hipError_to_string(hip_status);
+                if(skip_runtime_fails)
                 {
-                    std::cerr << "free vram: " << free << " (" << (double)free
-                              << ") total vram: " << total << " (" << (double)total << ")"
-                              << std::endl;
-                    if(free > obuffer_sizes[i])
-                    {
-                        std::cerr << "The system reports that there is enough space." << std::endl;
-                    }
+                    GTEST_SKIP() << ss.str();
+                }
+                else
+                {
+                    GTEST_FAIL() << ss.str();
                 }
             }
-            ASSERT_EQ(hip_status, hipSuccess)
-                << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i]
-                << " (" << static_cast<double>(obuffer_sizes[i]) << ") " << params.str();
 
             // If we're validating output strides, init the
             // output buffer to a known pattern and we can check
@@ -1137,7 +1686,18 @@
             {
                 hip_status
                     = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
-                ASSERT_EQ(hip_status, hipSuccess) << "hipMemset failure";
+                if(hip_status != hipSuccess)
+                {
+                    ++n_hip_failures;
+                    if(skip_runtime_fails)
+                    {
+                        GTEST_SKIP() << "hipMemset failure with error " << hip_status;
+                    }
+                    else
+                    {
+                        GTEST_FAIL() << "hipMemset failure with error " << hip_status;
+                    }
+                }
             }
         }
     }
@@ -1186,8 +1746,11 @@
     // execute GPU transform
     //
     // limited scope for local variables
-    fftw_data_t gpu_output;
+    std::vector<hostbuf> gpu_output
+        = allocate_host_buffer(params.precision, params.otype, params.osize);
+
     execute_gpu_fft(params, pibuffer, pobuffer, gpu_output);
+    params.free();
 
     if(params.check_output_strides)
     {
@@ -1210,29 +1773,74 @@
     //
     // Compute the l-infinity and l-2 distance between the CPU and GPU output:
     // wait for cpu FFT so we can compute cutoff
-    cpu_fft.get();
-    std::vector<std::pair<size_t, size_t>> linf_failures;
-    const auto                             total_length = std::accumulate(params.length.begin(),
+
+    const auto total_length = std::accumulate(params.length.begin(),
                                               params.length.end(),
                                               static_cast<size_t>(1),
                                               std::multiplies<size_t>());
-    const double                           linf_cutoff
-        = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length);
-    VectorNorms diff = distance(cpu_output,
-                                gpu_output,
-                                params.olength(),
-                                params.nbatch,
-                                params.precision,
-                                contiguous_params.otype,
-                                contiguous_params.ostride,
-                                contiguous_params.odist,
-                                params.otype,
-                                params.ostride,
-                                params.odist,
-                                linf_failures,
-                                linf_cutoff,
-                                {0},
-                                params.ooffset);
+
+    std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
+    if(verbose > 1)
+        linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
+    double      linf_cutoff;
+    VectorNorms diff;
+
+    std::shared_future<void> compare_output = std::async(std::launch::async, [&]() {
+        cpu_fft.get();
+        linf_cutoff = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length);
+
+        diff = distance(cpu_output,
+                        gpu_output,
+                        params.olength(),
+                        params.nbatch,
+                        params.precision,
+                        contiguous_params.otype,
+                        contiguous_params.ostride,
+                        contiguous_params.odist,
+                        params.otype,
+                        params.ostride,
+                        params.odist,
+                        linf_failures.get(),
+                        linf_cutoff,
+                        {0},
+                        params.ooffset);
+    });
+
+    // Update the cache if this current transform is different from
+    // what's stored.  But if this transform only has a smaller batch
+    // than what's cached, we can still keep the cache around since
+    // the input/output we already have is still valid.
+    const bool update_last_cpu_fft_data
+        = last_cpu_fft_data.length != params.length
+          || last_cpu_fft_data.transform_type != params.transform_type
+          || last_cpu_fft_data.run_callbacks != params.run_callbacks
+          || last_cpu_fft_data.precision != params.precision
+          || params.nbatch > last_cpu_fft_data.nbatch;
+
+    // store cpu output in cache
+    if(update_last_cpu_fft_data)
+    {
+        last_cpu_fft_data.length         = params.length;
+        last_cpu_fft_data.nbatch         = params.nbatch;
+        last_cpu_fft_data.transform_type = params.transform_type;
+        last_cpu_fft_data.run_callbacks  = params.run_callbacks;
+        last_cpu_fft_data.precision      = params.precision;
+    }
+
+    compare_output.get();
+
+    if(!store_to_cache)
+        store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
+
+    Tparams params_inverse;
+
+    if(round_trip)
+    {
+        params_inverse.inverse_from_forward(params);
+
+        run_round_trip_inverse<Tparams>(
+            params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data);
+    }
 
     ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2));
     ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf));
@@ -1245,8 +1853,8 @@
         std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
         std::cout << "GPU output L2 norm:   " << gpu_norm.get().l_2 << "\n";
         std::cout << "GPU linf norm failures:";
-        std::sort(linf_failures.begin(), linf_failures.end());
-        for(const auto& i : linf_failures)
+        std::sort(linf_failures->begin(), linf_failures->end());
+        for(const auto& i : *linf_failures)
         {
             std::cout << " (" << i.first << "," << i.second << ")";
         }
@@ -1258,6 +1866,12 @@
 
     switch(params.precision)
     {
+    case fft_precision_half:
+        max_linf_eps_half
+            = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
+        max_l2_eps_half
+            = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
+        break;
     case fft_precision_single:
         max_linf_eps_single
             = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
@@ -1290,14 +1904,15 @@
         << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
         << params.str();
 
-    // store cpu output in cache
-    last_cpu_fft_data.length         = params.length;
-    last_cpu_fft_data.nbatch         = params.nbatch;
-    last_cpu_fft_data.transform_type = params.transform_type;
-    last_cpu_fft_data.run_callbacks  = params.run_callbacks;
-    last_cpu_fft_data.precision      = params.precision;
-    last_cpu_fft_data.cpu_output.swap(cpu_output);
-    last_cpu_fft_data.cpu_input.swap(cpu_input);
+    if(round_trip)
+    {
+        compare_round_trip_inverse<Tparams>(params_inverse,
+                                            contiguous_params,
+                                            gpu_input_data,
+                                            cpu_input,
+                                            cpu_input_norm.get(),
+                                            total_length);
+    }
 }
 
 #endif
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_1D.cpp rocfft-5.7.1/clients/tests/accuracy_test_1D.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test_1D.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test_1D.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -36,6 +36,9 @@
        131072,   262144,   524288,    1048576,   2097152,   4194304,   8388608, 16777216,
        33554432, 67108864, 134217728, 268435456, 536870912, 1073741824};
 
+const static std::vector<size_t> pow2_range_half // powers of two 2..65536, used by the *_half suites
+    = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
+
 const static std::vector<size_t> pow3_range = {3,
                                                9,
                                                27,
@@ -129,7 +132,7 @@
 INSTANTIATE_TEST_SUITE_P(pow2_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow2_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -138,10 +141,11 @@
                                                              place_range,
                                                              true)),
                          accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow2_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -151,10 +155,36 @@
                                                              true)),
                          accuracy_test::TestName);
 
+INSTANTIATE_TEST_SUITE_P(pow2_1D_half, // half precision only, lengths from pow2_range_half
+                         accuracy_test,
+                         ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half}),
+                                                             {fft_precision_half},
+                                                             batch_range_1D,
+                                                             stride_range,
+                                                             stride_range,
+                                                             ioffset_range_zero,
+                                                             ooffset_range_zero,
+                                                             place_range,
+                                                             true)),
+                         accuracy_test::TestName);
+
+INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, // NOTE(review): same arguments as pow2_1D_half
+                         accuracy_test,
+                         ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half}),
+                                                             {fft_precision_half},
+                                                             batch_range_1D,
+                                                             stride_range,
+                                                             stride_range,
+                                                             ioffset_range_zero, // NOTE(review): zero-offset range in an "offset" suite - confirm intended
+                                                             ooffset_range_zero, // NOTE(review): same question as the input offset above
+                                                             place_range,
+                                                             true)),
+                         accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(pow3_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow3_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -166,7 +196,7 @@
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow3_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -179,7 +209,7 @@
 INSTANTIATE_TEST_SUITE_P(pow5_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow5_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -191,7 +221,7 @@
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow5_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -204,7 +234,7 @@
 INSTANTIATE_TEST_SUITE_P(radX_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({radX_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -216,7 +246,7 @@
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({radX_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -229,7 +259,7 @@
 INSTANTIATE_TEST_SUITE_P(prime_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({prime_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -241,7 +271,7 @@
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({prime_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -254,7 +284,7 @@
 INSTANTIATE_TEST_SUITE_P(mix_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({mix_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -266,7 +296,7 @@
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({mix_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range_1D,
                                                              stride_range,
                                                              stride_range,
@@ -312,14 +342,30 @@
 //
 // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels
 // invloved.
-const static std::vector<size_t>              pow2_range_for_stride  = {4096, 8192, 524288};
-const static std::vector<std::vector<size_t>> stride_range_for_pow2  = {{2}, {3}};
-const static std::vector<size_t>              batch_range_for_stride = {2, 1};
+const static std::vector<size_t>              pow2_range_for_stride      = {4096, 8192, 524288};
+const static std::vector<size_t>              pow2_range_for_stride_half = {4096, 8192}; // lengths for the *_stride_*_half suites
+const static std::vector<std::vector<size_t>> stride_range_for_pow2      = {{2}, {3}};
+const static std::vector<size_t>              batch_range_for_stride     = {2, 1}; // NOTE(review): nearby stride suites pass batch_range_1D - confirm this is still used
+
 INSTANTIATE_TEST_SUITE_P(
     pow2_1D_stride_complex,
     accuracy_test,
     ::testing::ValuesIn(param_generator_complex(generate_lengths({pow2_range_for_stride}),
-                                                precision_range,
+                                                precision_range_sp_dp,
+                                                batch_range_1D,
+                                                stride_range_for_pow2,
+                                                stride_range_for_pow2,
+                                                ioffset_range_zero,
+                                                ooffset_range_zero,
+                                                place_range,
+                                                true)),
+    accuracy_test::TestName);
+
+INSTANTIATE_TEST_SUITE_P(
+    pow2_1D_stride_complex_half,
+    accuracy_test,
+    ::testing::ValuesIn(param_generator_complex(generate_lengths({pow2_range_for_stride_half}),
+                                                {fft_precision_half},
                                                 batch_range_1D,
                                                 stride_range_for_pow2,
                                                 stride_range_for_pow2,
@@ -333,7 +379,21 @@
     pow2_1D_stride_real,
     accuracy_test,
     ::testing::ValuesIn(param_generator_real(generate_lengths({pow2_range_for_stride}),
-                                             precision_range,
+                                             precision_range_sp_dp,
+                                             batch_range_1D,
+                                             stride_range_for_pow2,
+                                             stride_range_for_pow2,
+                                             ioffset_range_zero,
+                                             ooffset_range_zero,
+                                             place_range,
+                                             true)),
+    accuracy_test::TestName);
+
+INSTANTIATE_TEST_SUITE_P(
+    pow2_1D_stride_real_half,
+    accuracy_test,
+    ::testing::ValuesIn(param_generator_real(generate_lengths({pow2_range_for_stride_half}),
+                                             {fft_precision_half},
                                              batch_range_1D,
                                              stride_range_for_pow2,
                                              stride_range_for_pow2,
@@ -406,7 +466,7 @@
     pow2_1D_complex_batched_2D_strided,
     accuracy_test,
     ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow2_range_2D}),
-                                                              precision_range,
+                                                              precision_range_sp_dp,
                                                               ioffset_range_zero,
                                                               ooffset_range_zero,
                                                               place_range)),
@@ -417,7 +477,7 @@
     pow3_1D_complex_batched_2D_strided,
     accuracy_test,
     ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow3_range_2D}),
-                                                              precision_range,
+                                                              precision_range_sp_dp,
                                                               ioffset_range_zero,
                                                               ooffset_range_zero,
                                                               place_range)),
@@ -428,7 +488,7 @@
     pow5_1D_complex_batched_2D_strided,
     accuracy_test,
     ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow5_range_2D}),
-                                                              precision_range,
+                                                              precision_range_sp_dp,
                                                               ioffset_range_zero,
                                                               ooffset_range_zero,
                                                               place_range)),
@@ -440,7 +500,7 @@
     prime_1D_complex_batched_2D_strided,
     accuracy_test,
     ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({prime_range_2D}),
-                                                              precision_range,
+                                                              precision_range_sp_dp,
                                                               ioffset_range_zero,
                                                               ooffset_range_zero,
                                                               place_range)),
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_2D.cpp rocfft-5.7.1/clients/tests/accuracy_test_2D.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test_2D.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test_2D.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -36,6 +36,9 @@
 const static std::vector<size_t> pow2_range
     = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};
 
+// For the current configuration, half-precision has an FFT size limit of 65536
+const static std::vector<size_t> pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
+
 const static std::vector<size_t> pow3_range = {3, 9, 27, 81, 243, 729, 2187, 6561};
 
 const static std::vector<size_t> pow5_range = {5, 25, 125, 625, 3125, 15625};
@@ -56,7 +59,21 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow2_range,
                                                                                pow2_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
+                                                             batch_range,
+                                                             stride_range,
+                                                             stride_range,
+                                                             ioffset_range_zero,
+                                                             ooffset_range_zero,
+                                                             place_range,
+                                                             true)),
+                         accuracy_test::TestName);
+
+INSTANTIATE_TEST_SUITE_P(pow2_2D_half,
+                         accuracy_test,
+                         ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half,
+                                                                               {2, 4, 8, 16, 32}}),
+                                                             {fft_precision_half},
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -65,11 +82,12 @@
                                                              place_range,
                                                              true)),
                          accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow2_range,
                                                                                pow2_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -83,7 +101,7 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow3_range,
                                                                                pow3_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -92,11 +110,12 @@
                                                              place_range,
                                                              true)),
                          accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow3_range,
                                                                                pow3_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -110,7 +129,7 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow5_range,
                                                                                pow5_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -119,11 +138,12 @@
                                                              place_range,
                                                              true)),
                          accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({pow5_range,
                                                                                pow5_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -137,7 +157,7 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({prime_range,
                                                                                prime_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -146,11 +166,12 @@
                                                              place_range,
                                                              true)),
                          accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({prime_range,
                                                                                prime_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -164,7 +185,7 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({mix_range,
                                                                                mix_range}),
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -173,11 +194,12 @@
                                                              place_range,
                                                              true)),
                          accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(generate_lengths({mix_range,
                                                                                mix_range}),
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -192,7 +214,7 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(
                              generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}),
-                             precision_range,
+                             precision_range_full,
                              batch_range,
                              stride_range,
                              stride_range,
@@ -207,7 +229,7 @@
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(
                              generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}),
-                             precision_range,
+                             precision_range_full,
                              batch_range,
                              stride_range,
                              stride_range,
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_3D.cpp rocfft-5.7.1/clients/tests/accuracy_test_3D.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test_3D.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test_3D.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -34,6 +34,9 @@
 // TODO: 512, 1024, 2048 make the tests take too long; re-enable when
 // test speed is improved.
 static std::vector<size_t> pow2_range = {4, 8, 16, 32, 128, 256};
+// For the current configuration, half-precision has an FFT size limit of 65536
+static std::vector<size_t> pow2_range_half = {4, 8, 16, 32};
+
 // SBCC+SBRC as a sub-node of a 3D TRTRTR
 std::vector<std::vector<size_t>> pow2_adhoc = {{4, 4, 8192}};
 
@@ -55,7 +58,7 @@
     pow2_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range, pow2_range}),
-                                        precision_range,
+                                        precision_range_sp_dp,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -65,11 +68,25 @@
                                         true)),
     accuracy_test::TestName);
 
+INSTANTIATE_TEST_SUITE_P(pow2_3D_half,
+                         accuracy_test,
+                         ::testing::ValuesIn(param_generator(
+                             generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}),
+                             {fft_precision_half},
+                             batch_range,
+                             stride_range,
+                             stride_range,
+                             ioffset_range_zero,
+                             ooffset_range_zero,
+                             place_range,
+                             true)),
+                         accuracy_test::TestName);
+
 INSTANTIATE_TEST_SUITE_P(
     DISABLED_offset_pow2_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range, pow2_range}),
-                                        precision_range,
+                                        precision_range_full,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -83,7 +100,7 @@
     pow3_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range, pow3_range}),
-                                        precision_range,
+                                        precision_range_sp_dp,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -96,7 +113,7 @@
     DISABLED_offset_pow3_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range, pow3_range}),
-                                        precision_range,
+                                        precision_range_full,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -110,7 +127,7 @@
     pow5_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range, pow5_range}),
-                                        precision_range,
+                                        precision_range_sp_dp,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -123,7 +140,7 @@
     DISABLED_offset_pow5_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range, pow5_range}),
-                                        precision_range,
+                                        precision_range_full,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -137,7 +154,7 @@
     prime_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range, prime_range}),
-                                        precision_range,
+                                        precision_range_sp_dp,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -150,7 +167,7 @@
     DISABLED_offset_prime_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range, prime_range}),
-                                        precision_range,
+                                        precision_range_full,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -164,7 +181,7 @@
     mix_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow3_range, prime_range}),
-                                        precision_range,
+                                        precision_range_sp_dp,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -177,7 +194,7 @@
     DISABLED_offset_mix_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow3_range, prime_range}),
-                                        precision_range,
+                                        precision_range_full,
                                         batch_range,
                                         stride_range,
                                         stride_range,
@@ -195,7 +212,7 @@
     sbrc_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(generate_lengths({sbrc_range, sbrc_range, sbrc_range}),
-                                        precision_range,
+                                        precision_range_sp_dp,
                                         sbrc_batch_range,
                                         stride_range,
                                         stride_range,
@@ -207,13 +224,15 @@
 
 // pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't
 static std::vector<size_t> inner_batch_3D_range       = {4, 8, 16, 32, 20, 24, 64};
+static std::vector<size_t> inner_batch_3D_range_half  = {4, 8, 16, 32, 20, 24};
 static std::vector<size_t> inner_batch_3D_batch_range = {3, 2, 1};
+
 INSTANTIATE_TEST_SUITE_P(
     inner_batch_3D,
     accuracy_test,
     ::testing::ValuesIn(param_generator(
         generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}),
-        precision_range,
+        precision_range_sp_dp,
         inner_batch_3D_batch_range,
         stride_generator_3D_inner_batch(stride_range),
         stride_generator_3D_inner_batch(stride_range),
@@ -222,3 +241,19 @@
         place_range,
         true)),
     accuracy_test::TestName);
+
+INSTANTIATE_TEST_SUITE_P(
+    inner_batch_3D_half,
+    accuracy_test,
+    ::testing::ValuesIn(param_generator(generate_lengths({inner_batch_3D_range_half,
+                                                          inner_batch_3D_range_half,
+                                                          inner_batch_3D_range_half}),
+                                        {fft_precision_half},
+                                        inner_batch_3D_batch_range,
+                                        stride_generator_3D_inner_batch(stride_range),
+                                        stride_generator_3D_inner_batch(stride_range),
+                                        ioffset_range_zero,
+                                        ooffset_range_zero,
+                                        place_range,
+                                        true)),
+    accuracy_test::TestName);
\ No newline at end of file
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_adhoc.cpp rocfft-5.7.1/clients/tests/accuracy_test_adhoc.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test_adhoc.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test_adhoc.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -19,8 +19,6 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "../fft_params.h"
-
 #include "accuracy_test.h"
 
 std::vector<std::vector<size_t>> adhoc_sizes = {
@@ -54,6 +52,9 @@
 
     // TILE_UNALIGNED type of SBRC 3D ERC
     {98, 98, 98},
+
+    // 3D_BLOCK_CR
+    {336, 336, 56},
 };
 
 const static std::vector<std::vector<size_t>> stride_range = {{1}};
@@ -67,7 +68,7 @@
 INSTANTIATE_TEST_SUITE_P(adhoc,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(adhoc_sizes,
-                                                             precision_range,
+                                                             precision_range_sp_dp,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -80,7 +81,7 @@
 INSTANTIATE_TEST_SUITE_P(DISABLED_offset_adhoc,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator(adhoc_sizes,
-                                                             precision_range,
+                                                             precision_range_full,
                                                              batch_range,
                                                              stride_range,
                                                              stride_range,
@@ -90,23 +91,23 @@
                                                              true)),
                          accuracy_test::TestName);
 
+// Test that dist is ignored for batch-1 transforms.  Normally,
+// in-place transforms require the same input and output dist, but for
+// batch-1 dist isn't used for anything and differing dist should be allowed.
 inline auto param_permissive_iodist()
 {
     std::vector<std::vector<size_t>> lengths = adhoc_sizes;
-    // TODO- for these permissive iodist tests,
-    // some 98^3 sizes take too long for the exhaustive search buffer assignments
-    // about millions of assignments, thus the program is hung there.
-    // So we take this length out from iodist test for now.
-    lengths.erase(std::find(lengths.begin(), lengths.end(), std::vector<size_t>{98, 98, 98}));
     lengths.push_back({4});
 
     std::vector<fft_params> params;
-    for(const auto precision : precision_range)
+    for(const auto precision : precision_range_sp_dp)
     {
         for(const auto trans_type : trans_type_range)
         {
             for(const auto& types : generate_types(trans_type, place_range, true))
             {
+                if(std::get<1>(types) != fft_placement_inplace)
+                    continue;
                 for(const auto& len : lengths)
                 {
                     fft_params param;
@@ -133,14 +134,64 @@
                          ::testing::ValuesIn(param_permissive_iodist()),
                          accuracy_test::TestName);
 
-inline auto param_adhoc_stride()
+inline auto param_adhoc_colmajor()
 {
-    std::vector<std::vector<size_t>> lengths = adhoc_sizes;
-    lengths.push_back({4});
+    // start from basic single-precision, batch-2, out-of-place FFTs of the adhoc sizes
+    auto params = param_generator(adhoc_sizes,
+                                  {fft_precision_single},
+                                  {2},
+                                  stride_range,
+                                  stride_range,
+                                  ioffset_range_zero,
+                                  ooffset_range_zero,
+                                  {fft_placement_notinplace},
+                                  false);
+
+    // drop any params that cannot be made col-major:
+    // - 1D (only one dim, so nothing to swap)
+    // - real-complex 2D (the fastest dim is left untouched below, which
+    //   leaves a single dim and again nothing to swap)
+    params.erase(std::remove_if(params.begin(),
+                                params.end(),
+                                [](const fft_params& param) {
+                                    if(param.length.size() == 1)
+                                        return true;
+                                    if(param.length.size() == 2)
+                                    {
+                                        if(param.transform_type == fft_transform_type_real_forward
+                                           || param.transform_type
+                                                  == fft_transform_type_real_inverse)
+                                            return true;
+                                    }
+                                    return false;
+                                }),
+                 params.end());
+
+    // reverse the length and input/output stride order on the remaining
+    // params to make them col-major
+    std::for_each(params.begin(), params.end(), [](fft_params& param) {
+        size_t start_dim = 0;
+        // for real-complex we can't touch the fastest (last) dim, so start the reversal one past it
+        if(param.transform_type == fft_transform_type_real_forward
+           || param.transform_type == fft_transform_type_real_inverse)
+            ++start_dim;
+        std::reverse(param.length.rbegin() + start_dim, param.length.rend());
+        std::reverse(param.istride.rbegin() + start_dim, param.istride.rend());
+        std::reverse(param.ostride.rbegin() + start_dim, param.ostride.rend());
+    });
+    return params;
+}
 
+INSTANTIATE_TEST_SUITE_P(adhoc_colmajor,
+                         accuracy_test,
+                         ::testing::ValuesIn(param_adhoc_colmajor()),
+                         accuracy_test::TestName);
+
+inline auto param_adhoc_stride()
+{
     std::vector<fft_params> params;
 
-    for(const auto precision : precision_range)
+    for(const auto precision : precision_range_full)
     {
         for(const auto& types : generate_types(fft_transform_type_complex_forward,
                                                {fft_placement_inplace, fft_placement_notinplace},
@@ -164,38 +215,35 @@
 
         // test C2R/R2C with non-contiguous higher strides and dist - we
         // want unit stride for length0 so we do the even-length optimization
-        for(const auto trans_type :
-            {fft_transform_type_real_forward, fft_transform_type_real_inverse})
+        for(const auto& types :
+            generate_types(fft_transform_type_real_forward, {fft_placement_notinplace}, true))
         {
-            for(const auto& types : generate_types(trans_type, {fft_placement_notinplace}, true))
-            {
-                fft_params param;
-                param.length         = {4, 4, 4};
-                param.precision      = precision;
-                param.idist          = 0;
-                param.odist          = 0;
-                param.transform_type = trans_type;
-                param.nbatch         = 2;
-                param.placement      = std::get<1>(types);
-                param.itype          = std::get<2>(types);
-                param.otype          = std::get<3>(types);
-                param.istride        = {16, 4, 1};
-                param.ostride        = {16, 4, 1};
-                params.push_back(param);
-
-                param.length         = {2, 2, 2};
-                param.precision      = precision;
-                param.idist          = 0;
-                param.odist          = 0;
-                param.transform_type = trans_type;
-                param.nbatch         = 2;
-                param.placement      = std::get<1>(types);
-                param.itype          = std::get<2>(types);
-                param.otype          = std::get<3>(types);
-                param.istride        = {20, 6, 1};
-                param.ostride        = {20, 6, 1};
-                params.push_back(param);
-            }
+            fft_params param;
+            param.length         = {4, 4, 4};
+            param.precision      = precision;
+            param.idist          = 0;
+            param.odist          = 0;
+            param.transform_type = fft_transform_type_real_forward;
+            param.nbatch         = 2;
+            param.placement      = std::get<1>(types);
+            param.itype          = std::get<2>(types);
+            param.otype          = std::get<3>(types);
+            param.istride        = {16, 4, 1};
+            param.ostride        = {16, 4, 1};
+            params.push_back(param);
+
+            param.length         = {2, 2, 2};
+            param.precision      = precision;
+            param.idist          = 0;
+            param.odist          = 0;
+            param.transform_type = fft_transform_type_real_forward;
+            param.nbatch         = 2;
+            param.placement      = std::get<1>(types);
+            param.itype          = std::get<2>(types);
+            param.otype          = std::get<3>(types);
+            param.istride        = {20, 6, 1};
+            param.ostride        = {20, 6, 1};
+            params.push_back(param);
         }
     }
 
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_callback.cpp rocfft-5.7.1/clients/tests/accuracy_test_callback.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test_callback.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test_callback.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -18,8 +18,6 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "../rocfft_params.h"
-
 #include "accuracy_test.h"
 
 std::vector<std::vector<size_t>> callback_sizes = {
@@ -102,16 +100,14 @@
 const static std::vector<std::vector<size_t>> ioffset_range = {{0, 0}, {1, 1}};
 const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}};
 
-auto transform_types = {fft_transform_type_complex_forward,
-                        fft_transform_type_complex_inverse,
-                        fft_transform_type_real_forward,
-                        fft_transform_type_real_inverse};
+auto forward_transform_types
+    = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
 
 INSTANTIATE_TEST_SUITE_P(callback,
                          accuracy_test,
-                         ::testing::ValuesIn(param_generator_base(transform_types,
+                         ::testing::ValuesIn(param_generator_base(forward_transform_types,
                                                                   callback_sizes,
-                                                                  precision_range,
+                                                                  precision_range_sp_dp,
                                                                   batch_range,
                                                                   generate_types,
                                                                   stride_range,
@@ -125,9 +121,9 @@
 
 INSTANTIATE_TEST_SUITE_P(DISABLED_callback,
                          accuracy_test,
-                         ::testing::ValuesIn(param_generator_base(transform_types,
+                         ::testing::ValuesIn(param_generator_base(forward_transform_types,
                                                                   callback_sizes,
-                                                                  precision_range,
+                                                                  precision_range_sp_dp,
                                                                   batch_range,
                                                                   generate_types,
                                                                   stride_range,
@@ -145,7 +141,7 @@
 inline auto param_generator_scaling(const std::vector<std::vector<size_t>>& v_lengths)
 {
     auto params = param_generator(callback_sizes,
-                                  precision_range,
+                                  precision_range_sp_dp,
                                   batch_range,
                                   stride_range,
                                   stride_range,
diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_checkstride.cpp rocfft-5.7.1/clients/tests/accuracy_test_checkstride.cpp
--- rocfft-5.5.0/clients/tests/accuracy_test_checkstride.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/accuracy_test_checkstride.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -18,8 +18,6 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "../fft_params.h"
-
 #include "accuracy_test.h"
 
 inline auto param_checkstride()
@@ -59,7 +57,7 @@
     {
         for(const auto& s : sizes)
         {
-            for(const auto precision : precision_range)
+            for(const auto precision : precision_range_sp_dp)
             {
                 for(const auto& types :
                     generate_types(trans_type, {fft_placement_notinplace}, true))
diff -Nru rocfft-5.5.0/clients/tests/cmake/FindFFTW.cmake rocfft-5.7.1/clients/tests/cmake/FindFFTW.cmake
--- rocfft-5.5.0/clients/tests/cmake/FindFFTW.cmake	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/cmake/FindFFTW.cmake	2023-08-09 16:19:51.000000000 +0000
@@ -40,6 +40,8 @@
 # message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" )
 # message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" )
 
+include( CheckSymbolExists )
+
 set( FFTW_LIBRARIES "" )
 if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE )
   find_library( FFTW_LIBRARIES_SINGLE
@@ -68,6 +70,9 @@
     list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} )
     set( FFTW_MULTITHREAD TRUE )
   endif()
+
+  list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_SINGLE} )
+  check_symbol_exists( fftwf_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN )
 endif( )
 
 if( FFTW_FIND_REQUIRED_DOUBLE )
@@ -97,8 +102,15 @@
     list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} )
     set( FFTW_MULTITHREAD TRUE )
   endif()
+
+  list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} )
+  check_symbol_exists( fftw_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN )
 endif( )
 
+if( BUILD_FFTW OR FFTW_HAVE_SPRINT_PLAN )
+  target_compile_definitions( rocfft-test PUBLIC FFTW_HAVE_SPRINT_PLAN )
+endif()
+
 include( FindPackageHandleStandardArgs )
 FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW
     REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES )
diff -Nru rocfft-5.5.0/clients/tests/default_callbacks_test.cpp rocfft-5.7.1/clients/tests/default_callbacks_test.cpp
--- rocfft-5.5.0/clients/tests/default_callbacks_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/default_callbacks_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -26,7 +26,7 @@
 
 #include <hip/hip_runtime.h>
 
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "fftw_transform.h"
 #include "rocfft.h"
 
@@ -40,10 +40,10 @@
     return data[offset];
 }
 
-__device__ auto load_cb_double2 = load_cb<double2>;
-__device__ auto load_cb_double  = load_cb<double>;
-__device__ auto load_cb_float2  = load_cb<float2>;
-__device__ auto load_cb_float   = load_cb<float>;
+__device__ auto load_cb_complex_double = load_cb<rocfft_complex<double>>;
+__device__ auto load_cb_double         = load_cb<double>;
+__device__ auto load_cb_complex_float  = load_cb<rocfft_complex<float>>;
+__device__ auto load_cb_float          = load_cb<float>;
 
 // -------------------------------------
 // default store callback definitions
@@ -55,10 +55,10 @@
     data[offset] = element;
 }
 
-__device__ auto store_cb_double2 = store_cb<double2>;
-__device__ auto store_cb_double  = store_cb<double>;
-__device__ auto store_cb_float2  = store_cb<float2>;
-__device__ auto store_cb_float   = store_cb<float>;
+__device__ auto store_cb_complex_double = store_cb<rocfft_complex<double>>;
+__device__ auto store_cb_double         = store_cb<double>;
+__device__ auto store_cb_complex_float  = store_cb<rocfft_complex<float>>;
+__device__ auto store_cb_float          = store_cb<float>;
 
 // -------------------------------------
 // type traits definitions
@@ -71,13 +71,13 @@
 };
 
 template <>
-struct is_hip_complex<float2>
+struct is_hip_complex<rocfft_complex<float>>
 {
     static const bool value = true;
 };
 
 template <>
-struct is_hip_complex<double2>
+struct is_hip_complex<rocfft_complex<double>>
 {
     static const bool value = true;
 };
@@ -110,15 +110,15 @@
         float  low_bound_f = -1.0f, up_bound_f = 1.0f;
         double low_bound_d = -1.0, up_bound_d = 1.0;
 
-        std::vector<float2>  h_mem_out_f2, h_mem_out_no_cb_f2;
-        std::vector<double2> h_mem_out_d2, h_mem_out_no_cb_d2;
+        std::vector<rocfft_complex<float>>  h_mem_out_f2, h_mem_out_no_cb_f2;
+        std::vector<rocfft_complex<double>> h_mem_out_d2, h_mem_out_no_cb_d2;
 
         switch(fwrd_transf_type)
         {
         case rocfft_transform_type_complex_forward:
         {
-            std::vector<float2>  h_mem_in_f2;
-            std::vector<double2> h_mem_in_d2;
+            std::vector<rocfft_complex<float>>  h_mem_in_f2;
+            std::vector<rocfft_complex<double>> h_mem_in_d2;
 
             (frwd_transf_precision == rocfft_precision_single)
                 ? run(low_bound_f, up_bound_f, h_mem_in_f2, h_mem_out_f2, h_mem_out_no_cb_f2)
@@ -293,18 +293,16 @@
     void validate_test(const std::vector<Tout>& host_mem_out,
                        const std::vector<Tout>& host_mem_out_no_cb)
     {
-        std::vector<std::pair<size_t, size_t>> linf_failures;
-
         auto diff = distance_1to1_complex(
-            reinterpret_cast<const std::complex<Tbound>*>(host_mem_out.data()),
-            reinterpret_cast<const std::complex<Tbound>*>(host_mem_out_no_cb.data()),
+            reinterpret_cast<const rocfft_complex<Tbound>*>(host_mem_out.data()),
+            reinterpret_cast<const rocfft_complex<Tbound>*>(host_mem_out_no_cb.data()),
             host_mem_out.size(),
             1,
             1,
             host_mem_out.size(),
             1,
             host_mem_out_no_cb.size(),
-            linf_failures,
+            nullptr,
             type_epsilon<Tbound>(),
             {0},
             {0});
@@ -320,10 +318,11 @@
     void set_load_callback(){};
 
     template <>
-    void set_load_callback<double2>()
+    void set_load_callback<rocfft_complex<double>>()
     {
-        EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_double2), sizeof(void*)),
-                  hipSuccess);
+        EXPECT_EQ(
+            hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_double), sizeof(void*)),
+            hipSuccess);
     };
 
     template <>
@@ -334,10 +333,11 @@
     };
 
     template <>
-    void set_load_callback<float2>()
+    void set_load_callback<rocfft_complex<float>>()
     {
-        EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_float2), sizeof(void*)),
-                  hipSuccess);
+        EXPECT_EQ(
+            hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_float), sizeof(void*)),
+            hipSuccess);
     };
 
     template <>
@@ -355,10 +355,11 @@
     void set_store_callback(){};
 
     template <>
-    void set_store_callback<double2>()
+    void set_store_callback<rocfft_complex<double>>()
     {
-        EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_double2), sizeof(void*)),
-                  hipSuccess);
+        EXPECT_EQ(
+            hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_double), sizeof(void*)),
+            hipSuccess);
     };
 
     template <>
@@ -369,10 +370,11 @@
     };
 
     template <>
-    void set_store_callback<float2>()
+    void set_store_callback<rocfft_complex<float>>()
     {
-        EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_float2), sizeof(void*)),
-                  hipSuccess);
+        EXPECT_EQ(
+            hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_float), sizeof(void*)),
+            hipSuccess);
     };
 
     template <>
diff -Nru rocfft-5.5.0/clients/tests/fftw_transform.h rocfft-5.7.1/clients/tests/fftw_transform.h
--- rocfft-5.5.0/clients/tests/fftw_transform.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/fftw_transform.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -23,7 +23,6 @@
 #define FFTWTRANSFORM_H
 
 #include "test_params.h"
-#include <complex>
 #include <fftw3.h>
 #include <vector>
 
@@ -44,6 +43,11 @@
 template <typename Tfloat>
 inline double type_epsilon();
 template <>
+inline double type_epsilon<_Float16>()
+{
+    return half_epsilon;
+}
+template <>
 inline double type_epsilon<float>()
 {
     return single_epsilon;
@@ -61,6 +65,13 @@
 template <typename Tfloat>
 struct fftw_trait;
 template <>
+struct fftw_trait<_Float16>
+{
+    // fftw does not support half precision, so use single precision and convert
+    using fftw_complex_type = fftwf_complex;
+    using fftw_plan_type    = fftwf_plan;
+};
+template <>
 struct fftw_trait<float>
 {
     using fftw_complex_type = fftwf_complex;
@@ -73,6 +84,36 @@
     using fftw_plan_type    = fftw_plan;
 };
 
+// Copies the half-precision input buffer to a single-precision
+// buffer.  Note that the input buffer is already sized like it's a
+// single-precision buffer (but only half of it is filled), because
+// we allocate a single-precision buffer for FFTW to plan with.
+static hostbuf half_to_single_copy(const hostbuf& in)
+{
+    auto out      = in.copy();
+    auto in_begin = reinterpret_cast<const _Float16*>(in.data());
+    std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast<float*>(out.data()));
+    return out;
+}
+
+// converts a wider precision buffer to a narrower precision, in-place
+template <typename TfloatIn, typename TfloatOut>
+void narrow_precision_inplace(hostbuf& in)
+{
+    // ensure we're actually shrinking the data
+    static_assert(sizeof(TfloatIn) > sizeof(TfloatOut));
+
+    auto readPtr  = reinterpret_cast<const TfloatIn*>(in.data());
+    auto writePtr = reinterpret_cast<TfloatOut*>(in.data());
+    std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr);
+    in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut)));
+}
+
+static void single_to_half_inplace(hostbuf& in)
+{
+    narrow_precision_inplace<float, _Float16>(in);
+}
+
 // Template wrappers for real-valued FFTW allocators:
 template <typename Tfloat>
 inline Tfloat* fftw_alloc_real_type(size_t n);
@@ -124,14 +165,14 @@
     return fftw_alloc_complex_type<double>(n);
 }
 template <>
-inline std::complex<float>* fftw_alloc_type<std::complex<float>>(size_t n)
+inline rocfft_complex<float>* fftw_alloc_type<rocfft_complex<float>>(size_t n)
 {
-    return (std::complex<float>*)fftw_alloc_complex_type<float>(n);
+    return (rocfft_complex<float>*)fftw_alloc_complex_type<float>(n);
 }
 template <>
-inline std::complex<double>* fftw_alloc_type<std::complex<double>>(size_t n)
+inline rocfft_complex<double>* fftw_alloc_type<rocfft_complex<double>>(size_t n)
 {
-    return (std::complex<double>*)fftw_alloc_complex_type<double>(n);
+    return (rocfft_complex<double>*)fftw_alloc_complex_type<double>(n);
 }
 
 // Template wrappers for FFTW plan executors:
@@ -175,6 +216,20 @@
                          unsigned                                        flags);
 
 template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+    fftw_plan_guru64_dft<_Float16>(int                                               rank,
+                                   const fftw_iodim64*                               dims,
+                                   int                                               howmany_rank,
+                                   const fftw_iodim64*                               howmany_dims,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* in,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* out,
+                                   int                                               sign,
+                                   unsigned                                          flags)
+{
+    return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
+}
+
+template <>
 inline typename fftw_trait<float>::fftw_plan_type
     fftw_plan_guru64_dft<float>(int                                            rank,
                                 const fftw_iodim64*                            dims,
@@ -204,22 +259,42 @@
 
 // Template wrappers for FFTW c2c executors:
 template <typename Tfloat>
-inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type     plan,
-                                  typename fftw_trait<Tfloat>::fftw_complex_type* in,
-                                  typename fftw_trait<Tfloat>::fftw_complex_type* out);
+inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+                                  std::vector<hostbuf>&                       in,
+                                  std::vector<hostbuf>&                       out);
+
+template <>
+inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
+                                            std::vector<hostbuf>&                         in,
+                                            std::vector<hostbuf>&                         out)
+{
+    // since FFTW does not natively support half precision, convert
+    // input to single, execute, then convert output back to half
+    auto in_single = half_to_single_copy(in.front());
+    fftwf_execute_dft(plan,
+                      reinterpret_cast<fftwf_complex*>(in_single.data()),
+                      reinterpret_cast<fftwf_complex*>(out.front().data()));
+    single_to_half_inplace(out.front());
+}
+
 template <>
-inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type     plan,
-                                         typename fftw_trait<float>::fftw_complex_type* in,
-                                         typename fftw_trait<float>::fftw_complex_type* out)
+inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
+                                         std::vector<hostbuf>&                      in,
+                                         std::vector<hostbuf>&                      out)
 {
-    fftwf_execute_dft(plan, in, out);
+    fftwf_execute_dft(plan,
+                      reinterpret_cast<fftwf_complex*>(in.front().data()),
+                      reinterpret_cast<fftwf_complex*>(out.front().data()));
 }
+
 template <>
-inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type     plan,
-                                          typename fftw_trait<double>::fftw_complex_type* in,
-                                          typename fftw_trait<double>::fftw_complex_type* out)
+inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
+                                          std::vector<hostbuf>&                       in,
+                                          std::vector<hostbuf>&                       out)
 {
-    fftw_execute_dft(plan, in, out);
+    fftw_execute_dft(plan,
+                     reinterpret_cast<fftw_complex*>(in.front().data()),
+                     reinterpret_cast<fftw_complex*>(out.front().data()));
 }
 
 // Template wrappers for FFTW r2c planners:
@@ -233,6 +308,19 @@
                          typename fftw_trait<Tfloat>::fftw_complex_type* out,
                          unsigned                                        flags);
 template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+    fftw_plan_guru64_r2c<_Float16>(int                                               rank,
+                                   const fftw_iodim64*                               dims,
+                                   int                                               howmany_rank,
+                                   const fftw_iodim64*                               howmany_dims,
+                                   _Float16*                                         in,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* out,
+                                   unsigned                                          flags)
+{
+    return fftwf_plan_guru64_dft_r2c(
+        rank, dims, howmany_rank, howmany_dims, reinterpret_cast<float*>(in), out, flags);
+}
+template <>
 inline typename fftw_trait<float>::fftw_plan_type
     fftw_plan_guru64_r2c<float>(int                                            rank,
                                 const fftw_iodim64*                            dims,
@@ -259,22 +347,39 @@
 
 // Template wrappers for FFTW r2c executors:
 template <typename Tfloat>
-inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type     plan,
-                                  Tfloat*                                         in,
-                                  typename fftw_trait<Tfloat>::fftw_complex_type* out);
-template <>
-inline void fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type     plan,
-                                         float*                                         in,
-                                         typename fftw_trait<float>::fftw_complex_type* out)
-{
-    fftwf_execute_dft_r2c(plan, in, out);
-}
-template <>
-inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type     plan,
-                                          double*                                         in,
-                                          typename fftw_trait<double>::fftw_complex_type* out)
-{
-    fftw_execute_dft_r2c(plan, in, out);
+inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+                                  std::vector<hostbuf>&                       in,
+                                  std::vector<hostbuf>&                       out);
+template <>
+inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
+                                            std::vector<hostbuf>&                         in,
+                                            std::vector<hostbuf>&                         out)
+{
+    // FFTW has no native half precision: copy the input to single,
+    // execute in single, then narrow the output back to half in place
+    auto in_single = half_to_single_copy(in.front());
+    fftwf_execute_dft_r2c(plan,
+                          reinterpret_cast<float*>(in_single.data()),
+                          reinterpret_cast<fftwf_complex*>(out.front().data()));
+    single_to_half_inplace(out.front());
+}
+template <>
+inline void fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
+                                         std::vector<hostbuf>&                      in,
+                                         std::vector<hostbuf>&                      out)
+{
+    fftwf_execute_dft_r2c(plan,
+                          reinterpret_cast<float*>(in.front().data()),
+                          reinterpret_cast<fftwf_complex*>(out.front().data()));
+}
+template <>
+inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
+                                          std::vector<hostbuf>&                       in,
+                                          std::vector<hostbuf>&                       out)
+{
+    fftw_execute_dft_r2c(plan,
+                         reinterpret_cast<double*>(in.front().data()),
+                         reinterpret_cast<fftw_complex*>(out.front().data()));
 }
 
 // Template wrappers for FFTW c2r planners:
@@ -288,6 +393,19 @@
                          Tfloat*                                         out,
                          unsigned                                        flags);
 template <>
+inline typename fftw_trait<_Float16>::fftw_plan_type
+    fftw_plan_guru64_c2r<_Float16>(int                                               rank,
+                                   const fftw_iodim64*                               dims,
+                                   int                                               howmany_rank,
+                                   const fftw_iodim64*                               howmany_dims,
+                                   typename fftw_trait<_Float16>::fftw_complex_type* in,
+                                   _Float16*                                         out,
+                                   unsigned                                          flags)
+{
+    return fftwf_plan_guru64_dft_c2r(
+        rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast<float*>(out), flags);
+}
+template <>
 inline typename fftw_trait<float>::fftw_plan_type
     fftw_plan_guru64_c2r<float>(int                                            rank,
                                 const fftw_iodim64*                            dims,
@@ -314,56 +432,60 @@
 
 // Template wrappers for FFTW c2r executors:
 template <typename Tfloat>
-inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type     plan,
-                                  typename fftw_trait<Tfloat>::fftw_complex_type* in,
-                                  Tfloat*                                         out);
+inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type plan,
+                                  std::vector<hostbuf>&                       in,
+                                  std::vector<hostbuf>&                       out);
 template <>
-inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type     plan,
-                                         typename fftw_trait<float>::fftw_complex_type* in,
-                                         float*                                         out)
+inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
+                                            std::vector<hostbuf>&                         in,
+                                            std::vector<hostbuf>&                         out)
 {
-    fftwf_execute_dft_c2r(plan, in, out);
+    // FFTW has no native half precision: copy the input to single,
+    // execute in single, then narrow the output back to half in place
+    auto in_single = half_to_single_copy(in.front());
+    fftwf_execute_dft_c2r(plan,
+                          reinterpret_cast<fftwf_complex*>(in_single.data()),
+                          reinterpret_cast<float*>(out.front().data()));
+    single_to_half_inplace(out.front());
 }
 template <>
-inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type     plan,
-                                          typename fftw_trait<double>::fftw_complex_type* in,
-                                          double*                                         out)
+inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type plan,
+                                         std::vector<hostbuf>&                      in,
+                                         std::vector<hostbuf>&                      out)
 {
-    fftw_execute_dft_c2r(plan, in, out);
+    fftwf_execute_dft_c2r(plan,
+                          reinterpret_cast<fftwf_complex*>(in.front().data()),
+                          reinterpret_cast<float*>(out.front().data()));
 }
-
-// Allocator / deallocator for FFTW arrays.
-template <typename Tdata>
-struct fftwAllocator
+template <>
+inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type plan,
+                                          std::vector<hostbuf>&                       in,
+                                          std::vector<hostbuf>&                       out)
 {
-    using value_type = Tdata;
-
-    fftwAllocator() = default;
-    template <class U>
-    fftwAllocator(const fftwAllocator<U>&)
-    {
-    }
-
-    Tdata* allocate(size_t n)
-    {
-        return (Tdata*)fftw_malloc(sizeof(Tdata) * n);
-    }
-    void deallocate(Tdata* data, size_t n)
-    {
-        fftw_free(data);
-    }
-};
+    fftw_execute_dft_c2r(plan,
+                         reinterpret_cast<fftw_complex*>(in.front().data()),
+                         reinterpret_cast<double*>(out.front().data()));
+}
 
-template <typename Tdata1, typename Tdata2>
-inline bool operator==(const fftwAllocator<Tdata1>&, const fftwAllocator<Tdata2>&)
+#ifdef FFTW_HAVE_SPRINT_PLAN
+// Template wrappers for FFTW print plan:
+template <typename Tfloat>
+inline char* fftw_sprint_plan(const typename fftw_trait<Tfloat>::fftw_plan_type plan);
+template <>
+inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan)
 {
-    return true;
+    return fftwf_sprint_plan(plan);
 }
-
-template <typename Tdata1, typename Tdata2>
-inline bool operator!=(const fftwAllocator<Tdata1>& a, const fftwAllocator<Tdata2>& b)
+template <>
+inline char* fftw_sprint_plan<float>(const typename fftw_trait<float>::fftw_plan_type plan)
+{
+    return fftwf_sprint_plan(plan);
+}
+template <>
+inline char* fftw_sprint_plan<double>(const typename fftw_trait<double>::fftw_plan_type plan)
 {
-    return !(a == b);
+    return fftw_sprint_plan(plan);
 }
+#endif
 
 #endif
diff -Nru rocfft-5.5.0/clients/tests/gtest_main.cpp rocfft-5.7.1/clients/tests/gtest_main.cpp
--- rocfft-5.5.0/clients/tests/gtest_main.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/gtest_main.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -34,7 +34,6 @@
 #include "../../shared/concurrency.h"
 #include "../../shared/environment.h"
 #include "../../shared/work_queue.h"
-#include "../rocfft_params.h"
 #include "rocfft.h"
 #include "rocfft_accuracy_test.h"
 #include "test_params.h"
@@ -53,14 +52,27 @@
 
 // User-defined random seed
 size_t random_seed;
+// Probability of running individual planar FFTs
+double planar_prob;
+// Probability of running individual callback FFTs
+double callback_prob;
 
 // Transform parameters for manual test:
 fft_params manual_params;
 
-// Ram limitation for tests (GiB):
+// Host memory limitation for tests (GiB):
 size_t ramgb;
 
+// Device memory limitation for tests (GiB):
+size_t vramgb;
+
+// Allow skipping tests if there is a runtime error
+bool skip_runtime_fails;
+// But count the number of failures
+int n_hip_failures = 0;
+
 // Manually specified precision cutoffs:
+double half_epsilon;
 double single_epsilon;
 double double_epsilon;
 
@@ -69,6 +81,8 @@
 double max_l2_eps_double   = 0.0;
 double max_linf_eps_single = 0.0;
 double max_l2_eps_single   = 0.0;
+double max_linf_eps_half   = 0.0;
+double max_l2_eps_half     = 0.0;
 
 // Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE).
 bool use_fftw_wisdom = false;
@@ -76,27 +90,31 @@
 // Cache the last cpu fft that was requested
 last_cpu_fft_cache last_cpu_fft_data;
 
-static size_t get_system_memory_GiB()
+system_memory get_system_memory()
 {
-    // system memory often has a little chunk carved out for other
-    // stuff, so round up to nearest GiB.
+    system_memory memory_data;
 #ifdef WIN32
     MEMORYSTATUSEX info;
     info.dwLength = sizeof(info);
     if(!GlobalMemoryStatusEx(&info))
-        return 0;
-    return (info.ullTotalPhys + ONE_GiB - 1) / ONE_GiB;
+        return memory_data;
+    memory_data.total_bytes = info.ullTotalPhys;
+    memory_data.free_bytes  = info.ullAvailPhys;
 #else
     struct sysinfo info;
     if(sysinfo(&info) != 0)
-        return 0;
-    return (info.totalram * info.mem_unit + ONE_GiB - 1) / ONE_GiB;
+        return memory_data;
+    memory_data.total_bytes = info.totalram * info.mem_unit;
+    memory_data.free_bytes  = info.freeram * info.mem_unit;
 #endif
+    return memory_data;
 }
 
+system_memory start_memory = get_system_memory();
+
 void precompile_test_kernels(const std::string& precompile_file)
 {
-    std::cout << "precompiling test kernels..." << std::endl;
+    std::cout << "precompiling test kernels...\n";
     WorkQueue<std::string> tokenQueue;
 
     std::vector<std::string> tokens;
@@ -137,7 +155,7 @@
     std::mt19937       dist(dev());
     std::shuffle(tokens.begin(), tokens.end(), dist);
     auto precompile_begin = std::chrono::steady_clock::now();
-    std::cout << "precompiling " << tokens.size() << " FFT plans..." << std::endl;
+    std::cout << "precompiling " << tokens.size() << " FFT plans...\n";
 
     for(auto&& t : tokens)
         tokenQueue.push(std::move(t));
@@ -153,10 +171,30 @@
                 std::string token{tokenQueue.pop()};
                 if(token.empty())
                     break;
-                rocfft_params params;
-                params.from_token(token);
-                params.validate();
-                params.setup_structs();
+
+                try
+                {
+                    rocfft_params params_forward;
+                    params_forward.from_token(token);
+                    params_forward.validate();
+                    params_forward.setup_structs();
+
+                    params_forward.free(); // NOTE(review): presumably releases the forward plan first - confirm
+
+                    rocfft_params params_inverse;
+                    params_inverse.inverse_from_forward(params_forward);
+                    params_inverse.validate();
+                    params_inverse.setup_structs();
+                }
+                catch(const std::exception& e) // only read here, so bind by const reference
+                {
+                    // failed to create a plan, abort
+                    //
+                    // we could continue on, but the test should just
+                    // fail later anyway in the same way.  so report
+                    // which token failed early and get out
+                    throw std::runtime_error(token + " plan creation failure: " + e.what());
+                }
             }
         });
         // insert empty tokens to tell threads to stop
@@ -168,25 +206,12 @@
     auto                                      precompile_end = std::chrono::steady_clock::now();
     std::chrono::duration<double, std::milli> precompile_ms  = precompile_end - precompile_begin;
     std::cout << "done precompiling FFT plans in " << static_cast<size_t>(precompile_ms.count())
-              << " ms" << std::endl;
+              << " ms\n";
 }
 
 int main(int argc, char* argv[])
 {
-    // NB: If we initialize gtest first, then it removes all of its own command-line
-    // arguments and sets argc and argv correctly; no need to jump through hoops for
-    // boost::program_options.
-    ::testing::InitGoogleTest(&argc, argv);
-
-    // Filename for fftw and fftwf wisdom.
-    std::string fftw_wisdom_filename;
-
-    // Token string to fully specify fft params for the manual test.
-    std::string test_token;
-
-    // Filename for precompiled kernels to be written to
-    std::string precompile_file;
-
+    // Parse a few options (verbose, seed, planar_prob, callback_prob) ahead of gtest initialization.
     po::options_description opdesc(
         "\n"
         "rocFFT Runtime Test command line options\n"
@@ -206,13 +231,46 @@
         "      HP - hermitian planar\n"
         "\n"
         "Usage");
+    // clang-format off
+    opdesc.add_options()
+        ("verbose,v",
+         po::value<int>()->default_value(0),
+         "print out detailed information for the tests.")
+        ("seed", po::value<size_t>(&random_seed),
+         "Random seed; if unset, use an actual random seed.")
+        ("planar_prob", po::value<double>(&planar_prob)->default_value(0.1),
+        "Probability of running individual planar transforms")
+        ("callback_prob", po::value<double>(&callback_prob)->default_value(0.1),
+         "Probability of running individual callback transforms");
+    // clang-format on
+    po::variables_map vm;
+    po::store(po::command_line_parser(argc, argv).options(opdesc).allow_unregistered().run(), vm);
+    po::notify(vm);
+
+    verbose = vm["verbose"].as<int>();
+
+    // NB: If we initialize gtest first, then it removes all of its own command-line
+    // arguments and sets argc and argv correctly; no need to jump through hoops for
+    // boost::program_options.
+    ::testing::InitGoogleTest(&argc, argv);
+
+    // Filename for fftw and fftwf wisdom.
+    std::string fftw_wisdom_filename;
+
+    // Token string to fully specify fft params for the manual test.
+    std::string test_token;
+
+    // Filename for precompiled kernels to be written to
+    std::string precompile_file;
+
     // Declare the supported options.
     // clang-format doesn't handle boost program options very well:
     // clang-format off
     opdesc.add_options()
         ("help,h", "produces this help message")
-        ("verbose,v",  po::value<int>()->default_value(0),
-        "print out detailed information for the tests.")
+        ("skip_runtime_fails",  po::value<bool>(&skip_runtime_fails)->default_value(true),
+        "Skip the test if there is a runtime failure.")
+        ("version", "Print queryable version information from the rocfft library and exit")
         ("transformType,t", po::value<fft_transform_type>(&manual_params.transform_type)
          ->default_value(fft_transform_type_complex_forward),
          "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
@@ -220,7 +278,9 @@
         ("notInPlace,o", "Not in-place FFT transform (default: in-place)")
         ("callback", "Inject load/store callbacks")
         ("checkstride", "Check that data is not written outside of output strides")
-        ("double", "Double precision transform (default: single)")
+        ("double", "Double precision transform (deprecated: use --precision double)")
+        ("precision", po::value<fft_precision>(&manual_params.precision),
+         "Transform precision: single (default), double, half")
         ( "itype", po::value<fft_array_type>(&manual_params.itype)
           ->default_value(fft_array_type_unset),
           "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
@@ -248,39 +308,46 @@
          "Logical size of input buffer.")
         ("osize", po::value<std::vector<size_t>>(&manual_params.osize)->multitoken(),
          "Logical size of output.")
-        ("R", po::value<size_t>(&ramgb)->default_value(get_system_memory_GiB()), "Ram limit in GiB for tests.")
-        ("single_epsilon",  po::value<double>(&single_epsilon)->default_value(3.75e-5)) 
-	("double_epsilon",  po::value<double>(&double_epsilon)->default_value(1e-15))
+        ("R", po::value<size_t>(&ramgb)->default_value((start_memory.total_bytes + ONE_GiB - 1) / ONE_GiB), "Ram limit in GiB for tests.")
+        ("V", po::value<size_t>(&vramgb)->default_value(0), "vram limit in GiB for tests.")
+        ("half_epsilon",  po::value<double>(&half_epsilon)->default_value(9.77e-4))
+        ("single_epsilon",  po::value<double>(&single_epsilon)->default_value(3.75e-5))
+        ("double_epsilon",  po::value<double>(&double_epsilon)->default_value(1e-15))
         ("wise,w", "use FFTW wisdom")
         ("wisdomfile,W",
          po::value<std::string>(&fftw_wisdom_filename)->default_value("wisdom3.txt"),
          "FFTW3 wisdom filename")
         ("scalefactor", po::value<double>(&manual_params.scale_factor), "Scale factor to apply to output.")
         ("token", po::value<std::string>(&test_token)->default_value(""), "Test token name for manual test")
-        ("precompile",  po::value<std::string>(&precompile_file), "Precompile kernels to a file for all test cases before running tests")
-        ("seed", po::value<size_t>(&random_seed), "Random seed; if unset, use an actual random seed.");
+        ("precompile",  po::value<std::string>(&precompile_file), "Precompile kernels to a file for all test cases before running tests");
     // clang-format on
 
-    po::variables_map vm;
     po::store(po::parse_command_line(argc, argv, opdesc), vm);
     po::notify(vm);
 
     if(vm.count("help"))
     {
-        std::cout << opdesc << std::endl;
+        std::cout << opdesc << "\n";
         return 0;
     }
-    verbose = vm["verbose"].as<int>();
 
-    std::cout << "single epsilon: " << single_epsilon << "\tdouble epsilon: " << double_epsilon
-              << std::endl;
+    if(vm.count("version"))
+    {
+        char v[256] = {}; // zero-filled so a failed query prints an empty string, not garbage
+        rocfft_get_version_string(v, sizeof(v));
+        std::cout << "version " << v << "\n";
+        return EXIT_SUCCESS;
+    }
+
+    std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon
+              << "\tdouble epsilon: " << double_epsilon << "\n";
 
     if(!vm.count("seed"))
     {
         std::random_device dev;
         random_seed = dev();
     }
-    std::cout << "Random seed: " << random_seed << std::endl;
+    std::cout << "Random seed: " << random_seed << "\n";
 
     if(vm.count("wise"))
     {
@@ -302,7 +369,7 @@
     rocfft_setup();
     char v[256];
     rocfft_get_version_string(v, 256);
-    std::cout << "rocFFT version: " << v << std::endl;
+    std::cout << "rocFFT version: " << v << "\n";
 
 #ifdef FFTW_MULTITHREAD
     fftw_init_threads();
@@ -359,7 +426,7 @@
 
     if(test_token != "")
     {
-        std::cout << "Reading fft params from token:\n" << test_token << std::endl;
+        std::cout << "Reading fft params from token:\n" << test_token << "\n";
 
         try
         {
@@ -367,7 +434,7 @@
         }
         catch(...)
         {
-            std::cout << "Unable to parse token." << std::endl;
+            std::cout << "Unable to parse token.\n";
             return 1;
         }
     }
@@ -381,7 +448,8 @@
 
         manual_params.placement
             = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace;
-        manual_params.precision = vm.count("double") ? fft_precision_double : fft_precision_single;
+        if(vm.count("double"))
+            manual_params.precision = fft_precision_double;
 
         if(vm.count("callback"))
         {
@@ -424,23 +492,27 @@
 
     rocfft_cleanup();
 
-    std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << std::endl;
-    std::cout << "single precision max l2 epsilon:     " << max_l2_eps_single << std::endl;
-    std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << std::endl;
-    std::cout << "double precision max l2 epsilon:     " << max_l2_eps_double << std::endl;
+    std::cout << "Random seed: " << random_seed << "\n";
+    std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n";
+    std::cout << "half precision max l2 epsilon:     " << max_l2_eps_half << "\n";
+    std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << "\n";
+    std::cout << "single precision max l2 epsilon:     " << max_l2_eps_single << "\n";
+    std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n";
+    std::cout << "double precision max l2 epsilon:     " << max_l2_eps_double << "\n";
+    std::cout << "Number of runtime issues: " << n_hip_failures << "\n";
 
     return retval;
 }
 
-TEST(manual, vs_fftw)
+TEST(manual, vs_fftw) // MANUAL TESTS HERE
 {
     // Run an individual test using the provided command-line parameters.
     manual_params.validate();
 
     std::cout << "Manual test:"
-              << "\n\t" << manual_params.str("\n\t") << std::endl;
+              << "\n\t" << manual_params.str("\n\t") << "\n";
 
-    std::cout << "Token: " << manual_params.token() << std::endl;
+    std::cout << "Token: " << manual_params.token() << "\n";
 
     if(!manual_params.valid(verbose + 2))
     {
diff -Nru rocfft-5.5.0/clients/tests/hermitian_test.cpp rocfft-5.7.1/clients/tests/hermitian_test.cpp
--- rocfft-5.5.0/clients/tests/hermitian_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/hermitian_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -19,7 +19,7 @@
 // THE SOFTWARE.
 
 #include "../../shared/gpubuf.h"
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "../samples/rocfft/examplekernels.h"
 #include "../samples/rocfft/exampleutils.h"
 #include "accuracy_test.h"
@@ -54,22 +54,24 @@
 
     ASSERT_TRUE(p.valid(verbose));
 
-    std::vector<std::complex<double>> h_input(p.isize[0]);
+    std::vector<hipDoubleComplex> h_input(p.isize[0]);
 
     std::random_device                     rd;
     std::mt19937                           gen(rd());
     std::uniform_real_distribution<double> dis(0.0, 1.0);
     for(auto& val : h_input)
     {
-        val = std::complex<double>(dis(gen), dis(gen));
+        val.x = dis(gen);
+        val.y = dis(gen);
     }
 
-    if(verbose)
+    if(verbose > 2)
     {
         std::cout << "non-Hermitian input:";
         for(const auto& val : h_input)
         {
-            std::cout << " " << val;
+            std::cout << " "
+                      << "(" << val.x << ", " << val.y << ")";
         }
         std::cout << std::endl;
     }
@@ -94,7 +96,7 @@
 
     ASSERT_TRUE(hipDeviceSynchronize() == hipSuccess);
 
-    if(verbose)
+    if(verbose > 2)
     {
         std::cout << "output:";
         for(const auto& val : h_output)
@@ -104,20 +106,23 @@
         std::cout << std::endl;
     }
 
-    std::vector<std::complex<double>> h_input1 = h_input;
+    std::vector<hipDoubleComplex> h_input1(p.isize[0]);
+    std::copy(h_input.begin(), h_input.end(), h_input1.begin());
 
     // Impose Hermitian symmetry on the input:
-    h_input1[0].imag(0.0);
+    h_input1[0].y = 0.0;
+
     if(p.length[0] % 2 == 0)
     {
-        h_input1.back().imag(0.0);
+        h_input1.back().y = 0.0;
     }
-    if(verbose)
+    if(verbose > 2)
     {
         std::cout << "Hermitian input:";
         for(const auto& val : h_input1)
         {
-            std::cout << " " << val;
+            std::cout << " "
+                      << "(" << val.x << ", " << val.y << ")";
         }
         std::cout << std::endl;
     }
@@ -125,7 +130,8 @@
     double maxdiff = 0.0;
     for(unsigned int i = 0; i < h_input.size(); ++i)
     {
-        auto val = std::abs(h_input[i] - h_input1[i]);
+        auto val = std::abs(
+            rocfft_complex<double>(h_input[i].x - h_input1[i].x, h_input[i].y - h_input1[i].y));
         if(val > maxdiff)
             maxdiff = val;
     }
@@ -138,7 +144,7 @@
     ASSERT_TRUE(hipMemcpy(h_output1.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost)
                 == hipSuccess);
 
-    if(verbose)
+    if(verbose > 2)
     {
         std::cout << "output:";
         for(const auto& val : h_output1)
@@ -227,8 +233,8 @@
 
         // Data buffers:
         gpubuf buf;
-        ASSERT_TRUE(buf.alloc(sizeof(std::complex<double>) * p.isize[0]) == hipSuccess);
-        std::vector<std::complex<double>> hbuf(p.isize[0]);
+        ASSERT_TRUE(buf.alloc(sizeof(hipDoubleComplex) * p.isize[0]) == hipSuccess);
+        std::vector<hipDoubleComplex> hbuf(p.isize[0]);
 
         // Initialize a Hermitian-symmetric array; it should be symmetric.
         init_hermitiancomplex_cm(p.length_cm(), p.ilength_cm(), p.istride_cm(), buf.data());
@@ -249,7 +255,8 @@
         std::uniform_real_distribution<double> unif(0, 1);
         for(auto& v : hbuf)
         {
-            v = std::complex<double>(unif(rng), unif(rng));
+            v.x = unif(rng);
+            v.y = unif(rng);
         }
         if(verbose > 2)
         {
@@ -288,15 +295,18 @@
 
         ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);
 
-        std::vector<std::complex<double>> h_output(p.osize[0]);
-        std::fill(h_output.begin(), h_output.end(), 0.0);
+        std::vector<hipDoubleComplex> h_output(p.osize[0]);
+        std::fill(h_output.begin(), h_output.end(), hipDoubleComplex{0.0, 0.0});
+
         ASSERT_TRUE(
             hipMemcpy(h_output.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost)
             == hipSuccess);
 
         impose_hermitian_symmetry_cm(p.length_cm(), p.olength_cm(), p.ostride_cm(), obuf.data());
-        std::vector<std::complex<double>> h_output_resym(p.osize[0]);
-        std::fill(h_output_resym.begin(), h_output_resym.end(), 0.0);
+
+        std::vector<hipDoubleComplex> h_output_resym(p.osize[0]);
+        std::fill(h_output_resym.begin(), h_output_resym.end(), hipDoubleComplex{0.0, 0.0});
+
         ASSERT_TRUE(
             hipMemcpy(
                 h_output_resym.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost)
@@ -305,8 +315,8 @@
         double maxdiff = 0;
         for(unsigned int i = 0; i < h_output.size(); ++i)
         {
-            auto rdiff = std::abs(h_output[i].real() - h_output_resym[i].real());
-            auto idiff = std::abs(h_output[i].imag() - h_output_resym[i].imag());
+            auto rdiff = std::abs(h_output[i].x - h_output_resym[i].x);
+            auto idiff = std::abs(h_output[i].y - h_output_resym[i].y);
             maxdiff    = std::max({maxdiff, rdiff, idiff});
         }
 
diff -Nru rocfft-5.5.0/clients/tests/hipGraph_test.cpp rocfft-5.7.1/clients/tests/hipGraph_test.cpp
--- rocfft-5.5.0/clients/tests/hipGraph_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/hipGraph_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,7 +20,7 @@
 
 #include "../../shared/arithmetic.h"
 #include "../../shared/gpubuf.h"
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "accuracy_test.h"
 #include "rocfft.h"
 #include "rocfft_against_fftw.h"
@@ -32,7 +32,7 @@
 
 static const unsigned int KERNEL_THREADS = 64;
 
-__global__ void scale_data_kernel(float2* data, size_t length, float scale)
+__global__ void scale_data_kernel(rocfft_complex<float>* data, size_t length, float scale)
 {
     const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -44,7 +44,19 @@
 }
 
 template <typename T>
-__global__ void offset_data_kernel(T* data, size_t length, T offset)
+__global__ void offset_data_kernel_complex(T* data, size_t length, T offset)
+{
+    const auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if(tid >= length)
+        return; // thread index falls past the end of the array
+    // add the offset to both components of this thread's element
+    data[tid].x += offset.x;
+    data[tid].y += offset.y;
+}
+
+template <typename T>
+__global__ void offset_data_kernel_real(T* data, size_t length, T offset)
 {
     const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -54,10 +66,10 @@
     }
 }
 
-static void init_input_data(size_t               N,
-                            size_t               seed,
-                            std::vector<float2>& host_data,
-                            gpubuf_t<float2>&    device_data)
+static void init_input_data(size_t                              N,
+                            size_t                              seed,
+                            std::vector<rocfft_complex<float>>& host_data,
+                            gpubuf_t<rocfft_complex<float>>&    device_data)
 {
     std::minstd_rand                      gen(seed);
     std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
@@ -69,7 +81,7 @@
         host_data[i].y = dist(gen);
     }
 
-    size_t Nbytes = N * sizeof(float2);
+    size_t Nbytes = N * sizeof(rocfft_complex<float>);
 
     if(device_data.alloc(Nbytes) != hipSuccess)
         throw std::bad_alloc();
@@ -146,7 +158,8 @@
     ASSERT_EQ(rocfft_execute(plan_inv, &in_ptr, &out_ptr, info), rocfft_status_success);
 }
 
-static void scale_device_data(hipStream_t stream, float scale, size_t N, float2* data)
+static void
+    scale_device_data(hipStream_t stream, float scale, size_t N, rocfft_complex<float>* data)
 {
     auto blockSize = KERNEL_THREADS;
     auto numBlocks = DivRoundingUp<size_t>(N, blockSize);
@@ -161,11 +174,26 @@
 }
 
 template <typename T>
-static void offset_device_data(hipStream_t stream, T offset, size_t N, T* data)
+static void offset_device_data_real(hipStream_t stream, T offset, size_t N, T* data)
+{
+    const dim3 threadsPerBlock(KERNEL_THREADS);
+    const dim3 blockCount(DivRoundingUp<size_t>(N, KERNEL_THREADS));
+    hipLaunchKernelGGL(offset_data_kernel_real<T>, // launch the real-valued offset kernel
+                       blockCount,
+                       threadsPerBlock,
+                       0, // sharedMemBytes
+                       stream, // stream
+                       data,
+                       N,
+                       offset);
+}
+
+template <typename T>
+static void offset_device_data_complex(hipStream_t stream, T offset, size_t N, T* data)
 {
     auto blockSize = KERNEL_THREADS;
     auto numBlocks = DivRoundingUp<size_t>(N, blockSize);
-    hipLaunchKernelGGL(offset_data_kernel<T>,
+    hipLaunchKernelGGL(offset_data_kernel_complex<T>,
                        dim3(numBlocks),
                        dim3(blockSize),
                        0, // sharedMemBytes
@@ -196,34 +224,33 @@
     ASSERT_EQ(host_data == host_data_compare, true);
 }
 
-static void compare_data(const std::vector<float2>& original_host_data,
-                         const gpubuf_t<float2>&    modified_device_data)
+static void compare_data(const std::vector<rocfft_complex<float>>& original_host_data,
+                         const gpubuf_t<rocfft_complex<float>>&    modified_device_data)
 {
-    std::vector<float2> modified_host_data(original_host_data.size());
+    std::vector<rocfft_complex<float>> modified_host_data(original_host_data.size());
 
     // Copy result back to host
     ASSERT_EQ(hipMemcpy(modified_host_data.data(),
                         modified_device_data.data(),
-                        modified_host_data.size() * sizeof(float2),
+                        modified_host_data.size() * sizeof(rocfft_complex<float>),
                         hipMemcpyDeviceToHost),
               hipSuccess);
 
     // Compare data we got to the original.
     // We're running 2 transforms (forward+inverse), so we
     // should tolerate 2x the error of a single transform.
-    std::vector<std::pair<size_t, size_t>> linf_failures;
-    const double                           MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();
+    const double MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();
 
     auto input_norm
-        = norm_complex(reinterpret_cast<const std::complex<float>*>(original_host_data.data()),
+        = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(original_host_data.data()),
                        original_host_data.size(),
                        1,
                        1,
                        original_host_data.size(),
                        {0});
     auto diff = distance_1to1_complex(
-        reinterpret_cast<const std::complex<float>*>(original_host_data.data()),
-        reinterpret_cast<const std::complex<float>*>(modified_host_data.data()),
+        reinterpret_cast<const rocfft_complex<float>*>(original_host_data.data()),
+        reinterpret_cast<const rocfft_complex<float>*>(modified_host_data.data()),
         // data is all contiguous, we can treat it as 1d
         original_host_data.size(),
         1,
@@ -231,7 +258,7 @@
         original_host_data.size(),
         1,
         modified_host_data.size(),
-        linf_failures,
+        nullptr,
         MAX_TRANSFORM_ERROR,
         {0},
         {0});
@@ -253,26 +280,26 @@
 
     size_t seed = 100;
 
-    auto offset_1 = float2(.1, .1);
-    auto offset_2 = float2(-.1, -.1);
+    auto offset_1 = rocfft_complex<float>{.1, .1};
+    auto offset_2 = rocfft_complex<float>{-.1, -.1};
 
     float scale     = 2.2;
     float inv_scale = 1. / scale;
 
-    auto output_init_val = float2(0., 0.);
+    auto output_init_val = rocfft_complex<float>(0., 0.);
 
     size_t num_kernel_launches = 100;
     size_t num_graph_launches  = 10;
 
-    gpubuf_t<float2>    device_mem_in;
-    std::vector<float2> host_mem_in;
+    gpubuf_t<rocfft_complex<float>>    device_mem_in;
+    std::vector<rocfft_complex<float>> host_mem_in;
     init_input_data(N, seed, host_mem_in, device_mem_in);
-    float2* in_ptr = static_cast<float2*>(device_mem_in.data());
+    rocfft_complex<float>* in_ptr = static_cast<rocfft_complex<float>*>(device_mem_in.data());
 
-    gpubuf_t<float2>    device_mem_out;
-    std::vector<float2> host_mem_out;
-    init_data<float2>(N, output_init_val, host_mem_out, device_mem_out);
-    float2* out_ptr = static_cast<float2*>(device_mem_out.data());
+    gpubuf_t<rocfft_complex<float>>    device_mem_out;
+    std::vector<rocfft_complex<float>> host_mem_out;
+    init_data<rocfft_complex<float>>(N, output_init_val, host_mem_out, device_mem_out);
+    rocfft_complex<float>* out_ptr = static_cast<rocfft_complex<float>*>(device_mem_out.data());
 
     gpubuf_t<size_t>    device_mem_counter;
     std::vector<size_t> host_mem_counter;
@@ -297,11 +324,11 @@
 
     // add offset to device input data
     for(size_t i = 0; i < num_kernel_launches; ++i)
-        offset_device_data<float2>(stream, offset_1, N, in_ptr);
+        offset_device_data_complex<rocfft_complex<float>>(stream, offset_1, N, in_ptr);
 
     // back out the offsets
     for(size_t i = 0; i < num_kernel_launches; ++i)
-        offset_device_data<float2>(stream, offset_2, N, in_ptr);
+        offset_device_data_complex<rocfft_complex<float>>(stream, offset_2, N, in_ptr);
 
     // scale the device input data
     scale_device_data(stream, scale, N, in_ptr);
@@ -326,20 +353,20 @@
 
     // add offset to device output data
     for(size_t i = 0; i < num_kernel_launches; ++i)
-        offset_device_data<float2>(stream, offset_1, N, out_ptr);
+        offset_device_data_complex<rocfft_complex<float>>(stream, offset_1, N, out_ptr);
 
     // back out the offsets
     for(size_t i = 0; i < num_kernel_launches; ++i)
-        offset_device_data<float2>(stream, offset_2, N, out_ptr);
+        offset_device_data_complex<rocfft_complex<float>>(stream, offset_2, N, out_ptr);
 
     // increment counter
-    offset_device_data<size_t>(stream, 1, N, counter_ptr);
+    offset_device_data_real<size_t>(stream, 1, N, counter_ptr);
 
     ASSERT_EQ(hipStreamEndCapture(stream, &graph), hipSuccess);
 
     // make sure no actual work has been done for
     // the captured stream before graph execution
-    compare_data_exact_match<float2>(other_stream, host_mem_out, device_mem_out);
+    compare_data_exact_match<rocfft_complex<float>>(other_stream, host_mem_out, device_mem_out);
 
     ASSERT_EQ(hipGraphInstantiate(&graph_exec, graph, NULL, NULL, 0), hipSuccess);
     ASSERT_EQ(hipGraphDestroy(graph), hipSuccess);
diff -Nru rocfft-5.5.0/clients/tests/multithread_test.cpp rocfft-5.7.1/clients/tests/multithread_test.cpp
--- rocfft-5.5.0/clients/tests/multithread_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/multithread_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -19,7 +19,7 @@
 // THE SOFTWARE.
 
 #include "../../shared/gpubuf.h"
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "accuracy_test.h"
 #include "rocfft.h"
 #include "rocfft_against_fftw.h"
@@ -32,7 +32,7 @@
 
 // normalize results of an inverse transform, so it can be directly
 // compared to the original data before the forward transform
-__global__ void normalize_inverse_results(float2* array, float N)
+__global__ void normalize_inverse_results(rocfft_complex<float>* array, float N)
 {
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
     array[idx].x /= N;
@@ -58,7 +58,7 @@
             datasize *= N;
         }
 
-        size_t Nbytes = datasize * sizeof(float2);
+        size_t Nbytes = datasize * sizeof(rocfft_complex<float>);
 
         // Create HIP device buffers
         if(device_mem_in.alloc(Nbytes) != hipSuccess)
@@ -156,7 +156,7 @@
                            1,
                            0, // sharedMemBytes
                            stream, // stream
-                           static_cast<float2*>(device_mem_out.data()),
+                           static_cast<rocfft_complex<float>*>(device_mem_out.data()),
                            static_cast<float>(host_mem_out.size()));
         ran_transform = true;
     }
@@ -190,26 +190,25 @@
 
             ASSERT_EQ(hipMemcpy(host_mem_out.data(),
                                 device_mem_out.data(),
-                                host_mem_out.size() * sizeof(float2),
+                                host_mem_out.size() * sizeof(rocfft_complex<float>),
                                 hipMemcpyDeviceToHost),
                       hipSuccess);
 
             // Compare data we got to the original.
             // We're running 2 transforms (forward+inverse), so we
             // should tolerate 2x the error of a single transform.
-            std::vector<std::pair<size_t, size_t>> linf_failures;
-            const double                           MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();
+            const double MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();
 
             auto input_norm
-                = norm_complex(reinterpret_cast<const std::complex<float>*>(host_mem_in.data()),
+                = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(host_mem_in.data()),
                                host_mem_in.size(),
                                1,
                                1,
                                host_mem_in.size(),
                                {0});
             auto diff = distance_1to1_complex(
-                reinterpret_cast<const std::complex<float>*>(host_mem_in.data()),
-                reinterpret_cast<const std::complex<float>*>(host_mem_out.data()),
+                reinterpret_cast<const rocfft_complex<float>*>(host_mem_in.data()),
+                reinterpret_cast<const rocfft_complex<float>*>(host_mem_out.data()),
                 // data is all contiguous, we can treat it as 1d
                 host_mem_in.size(),
                 1,
@@ -217,7 +216,7 @@
                 host_mem_in.size(),
                 1,
                 host_mem_out.size(),
-                linf_failures,
+                nullptr,
                 MAX_TRANSFORM_ERROR,
                 {0},
                 {0});
@@ -236,18 +235,18 @@
     {
         do_cleanup();
     }
-    size_t              N                = 0;
-    size_t              dim              = 0;
-    uint32_t            seed             = 0;
-    hipStream_t         stream           = nullptr;
-    rocfft_plan         plan             = nullptr;
-    rocfft_plan         plan_inv         = nullptr;
-    size_t              work_buffer_size = 0;
-    void*               work_buffer      = nullptr;
-    gpubuf              device_mem_in;
-    gpubuf              device_mem_out;
-    std::vector<float2> host_mem_in;
-    std::vector<float2> host_mem_out;
+    size_t                             N                = 0;
+    size_t                             dim              = 0;
+    uint32_t                           seed             = 0;
+    hipStream_t                        stream           = nullptr;
+    rocfft_plan                        plan             = nullptr;
+    rocfft_plan                        plan_inv         = nullptr;
+    size_t                             work_buffer_size = 0;
+    void*                              work_buffer      = nullptr;
+    gpubuf                             device_mem_in;
+    gpubuf                             device_mem_out;
+    std::vector<rocfft_complex<float>> host_mem_in;
+    std::vector<rocfft_complex<float>> host_mem_out;
 
     // ensure that we don't forget to actually run the transform
     bool ran_transform = false;
diff -Nru rocfft-5.5.0/clients/tests/random.cpp rocfft-5.7.1/clients/tests/random.cpp
--- rocfft-5.5.0/clients/tests/random.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/random.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -22,7 +22,6 @@
 #include <iostream>
 #include <random>
 
-#include "../rocfft_params.h"
 #include "accuracy_test.h"
 #include "rocfft_accuracy_test.h"
 
@@ -93,14 +92,14 @@
         std::cout << "Params are not valid\n";
     }
 
-    fft_vs_reference(params);
+    fft_vs_reference(params, true);
 }
 
 INSTANTIATE_TEST_SUITE_P(random_complex_1d,
                          random_params,
                          ::testing::Combine(::testing::Range(0, n_random_tests),
                                             ::testing::ValuesIn({1}),
-                                            ::testing::ValuesIn(precision_range),
+                                            ::testing::ValuesIn(precision_range_sp_dp),
                                             ::testing::ValuesIn(place_range),
                                             ::testing::ValuesIn(trans_type_range_complex)));
 
@@ -108,7 +107,7 @@
                          random_params,
                          ::testing::Combine(::testing::Range(0, n_random_tests),
                                             ::testing::ValuesIn({2}),
-                                            ::testing::ValuesIn(precision_range),
+                                            ::testing::ValuesIn(precision_range_sp_dp),
                                             ::testing::ValuesIn(place_range),
                                             ::testing::ValuesIn(trans_type_range_complex)));
 
@@ -116,7 +115,7 @@
                          random_params,
                          ::testing::Combine(::testing::Range(0, n_random_tests),
                                             ::testing::ValuesIn({3}),
-                                            ::testing::ValuesIn(precision_range),
+                                            ::testing::ValuesIn(precision_range_sp_dp),
                                             ::testing::ValuesIn(place_range),
                                             ::testing::ValuesIn(trans_type_range_complex)));
 
@@ -124,7 +123,7 @@
                          random_params,
                          ::testing::Combine(::testing::Range(0, n_random_tests),
                                             ::testing::ValuesIn({1}),
-                                            ::testing::ValuesIn(precision_range),
+                                            ::testing::ValuesIn(precision_range_sp_dp),
                                             ::testing::ValuesIn({fft_placement_notinplace}),
                                             ::testing::ValuesIn(trans_type_range_real)));
 
@@ -132,7 +131,7 @@
                          random_params,
                          ::testing::Combine(::testing::Range(0, n_random_tests),
                                             ::testing::ValuesIn({2}),
-                                            ::testing::ValuesIn(precision_range),
+                                            ::testing::ValuesIn(precision_range_sp_dp),
                                             ::testing::ValuesIn({fft_placement_notinplace}),
                                             ::testing::ValuesIn(trans_type_range_real)));
 
@@ -140,6 +139,6 @@
                          random_params,
                          ::testing::Combine(::testing::Range(0, n_random_tests),
                                             ::testing::ValuesIn({3}),
-                                            ::testing::ValuesIn(precision_range),
+                                            ::testing::ValuesIn(precision_range_sp_dp),
                                             ::testing::ValuesIn({fft_placement_notinplace}),
                                             ::testing::ValuesIn(trans_type_range_real)));
diff -Nru rocfft-5.5.0/clients/tests/rocfft_accuracy_test.cpp rocfft-5.7.1/clients/tests/rocfft_accuracy_test.cpp
--- rocfft-5.5.0/clients/tests/rocfft_accuracy_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/rocfft_accuracy_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -25,22 +25,25 @@
 #include <utility>
 #include <vector>
 
+#include "rocfft_accuracy_test.h"
+
 #include "../../shared/gpubuf.h"
-#include "../rocfft_params.h"
 #include "fftw_transform.h"
 #include "rocfft.h"
-#include "rocfft_accuracy_test.h"
 #include "rocfft_against_fftw.h"
 
-void fft_vs_reference(rocfft_params& params)
+void fft_vs_reference(rocfft_params& params, bool round_trip)
 {
     switch(params.precision)
     {
+    case fft_precision_half:
+        fft_vs_reference_impl<_Float16, rocfft_params>(params, round_trip);
+        break;
     case fft_precision_single:
-        fft_vs_reference_impl<float, rocfft_params>(params);
+        fft_vs_reference_impl<float, rocfft_params>(params, round_trip);
         break;
     case fft_precision_double:
-        fft_vs_reference_impl<double, rocfft_params>(params);
+        fft_vs_reference_impl<double, rocfft_params>(params, round_trip);
         break;
     }
 }
@@ -68,6 +71,6 @@
         GTEST_SKIP();
     }
 
-    fft_vs_reference(params);
+    fft_vs_reference(params, true);
     SUCCEED();
 }
diff -Nru rocfft-5.5.0/clients/tests/rocfft_accuracy_test.h rocfft-5.7.1/clients/tests/rocfft_accuracy_test.h
--- rocfft-5.5.0/clients/tests/rocfft_accuracy_test.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/rocfft_accuracy_test.h	2023-08-09 16:19:51.000000000 +0000
@@ -18,15 +18,12 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#pragma once
-
 #ifndef ROCFFT_ACCURACY_TEST
 #define ROCFFT_ACCURACY_TEST
 
-#include "../rocfft_params.h"
+#include "../../shared/rocfft_params.h"
 #include "accuracy_test.h"
-#include "rocfft_accuracy_test.h"
 
-void fft_vs_reference(rocfft_params& params);
+void fft_vs_reference(rocfft_params& params, bool round_trip = false);
 
 #endif
diff -Nru rocfft-5.5.0/clients/tests/rocfft_against_fftw.h rocfft-5.7.1/clients/tests/rocfft_against_fftw.h
--- rocfft-5.5.0/clients/tests/rocfft_against_fftw.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/rocfft_against_fftw.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -107,17 +107,17 @@
 
 // construct an FFTW plan, given rocFFT parameters.  output is
 // required if planning with wisdom.
-template <typename Tfloat, typename Tallocator>
+template <typename Tfloat>
 static typename fftw_trait<Tfloat>::fftw_plan_type
-    fftw_plan_via_rocfft(const std::vector<size_t>&                  length,
-                         const std::vector<size_t>&                  istride,
-                         const std::vector<size_t>&                  ostride,
-                         const size_t                                nbatch,
-                         const size_t                                idist,
-                         const size_t                                odist,
-                         const fft_transform_type                    transformType,
-                         std::vector<std::vector<char, Tallocator>>& input,
-                         std::vector<std::vector<char, Tallocator>>& output)
+    fftw_plan_via_rocfft(const std::vector<size_t>& length,
+                         const std::vector<size_t>& istride,
+                         const std::vector<size_t>& ostride,
+                         const size_t               nbatch,
+                         const size_t               idist,
+                         const size_t               odist,
+                         const fft_transform_type   transformType,
+                         std::vector<hostbuf>&      input,
+                         std::vector<hostbuf>&      output)
 {
     // Dimension configuration:
     std::vector<fftw_iodim64> dims(length.size());
@@ -145,39 +145,29 @@
 template <typename Tfloat>
 void fftw_run(fft_transform_type                          transformType,
               typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan,
-              void*                                       cpu_in,
-              void*                                       cpu_out)
+              std::vector<hostbuf>&                       cpu_in,
+              std::vector<hostbuf>&                       cpu_out)
 {
-    using fftw_complex_type = typename fftw_trait<Tfloat>::fftw_complex_type;
-
     switch(transformType)
     {
     case fft_transform_type_complex_forward:
     {
-        fftw_plan_execute_c2c<Tfloat>(cpu_plan,
-                                      reinterpret_cast<fftw_complex_type*>(cpu_in),
-                                      reinterpret_cast<fftw_complex_type*>(cpu_out));
+        fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
         break;
     }
     case fft_transform_type_complex_inverse:
     {
-        fftw_plan_execute_c2c<Tfloat>(cpu_plan,
-                                      reinterpret_cast<fftw_complex_type*>(cpu_in),
-                                      reinterpret_cast<fftw_complex_type*>(cpu_out));
+        fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
         break;
     }
     case fft_transform_type_real_forward:
     {
-        fftw_plan_execute_r2c<Tfloat>(cpu_plan,
-                                      reinterpret_cast<Tfloat*>(cpu_in),
-                                      reinterpret_cast<fftw_complex_type*>(cpu_out));
+        fftw_plan_execute_r2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
         break;
     }
     case fft_transform_type_real_inverse:
     {
-        fftw_plan_execute_c2r<Tfloat>(cpu_plan,
-                                      reinterpret_cast<fftw_complex_type*>(cpu_in),
-                                      reinterpret_cast<Tfloat*>(cpu_out));
+        fftw_plan_execute_c2r<Tfloat>(cpu_plan, cpu_in, cpu_out);
         break;
     }
     }
@@ -224,6 +214,9 @@
 {
     switch(precision)
     {
+    case fft_precision_half:
+        return type_epsilon<_Float16>();
+        break;
     case fft_precision_single:
         return type_epsilon<float>();
         break;
@@ -232,7 +225,6 @@
         break;
     default:
         throw std::runtime_error("Invalid precision");
-        return 0.0;
     }
 }
 
diff -Nru rocfft-5.5.0/clients/tests/rtc_helper_crash.cpp rocfft-5.7.1/clients/tests/rtc_helper_crash.cpp
--- rocfft-5.5.0/clients/tests/rtc_helper_crash.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/rtc_helper_crash.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,3 +1,23 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
 // just crash
 int main()
 {
diff -Nru rocfft-5.5.0/clients/tests/test_params.h rocfft-5.7.1/clients/tests/test_params.h
--- rocfft-5.5.0/clients/tests/test_params.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/test_params.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -27,14 +27,25 @@
 #include <stdexcept>
 
 extern int    verbose;
-extern size_t random_seed;
 extern size_t ramgb;
+extern size_t vramgb;
+
+extern size_t random_seed;
+extern double planar_prob;
+extern double callback_prob;
+
+extern double half_epsilon;
 extern double single_epsilon;
 extern double double_epsilon;
+extern bool   skip_runtime_fails;
 
 extern double max_linf_eps_double;
 extern double max_l2_eps_double;
 extern double max_linf_eps_single;
 extern double max_l2_eps_single;
+extern double max_linf_eps_half;
+extern double max_l2_eps_half;
+
+extern int n_hip_failures;
 
 #endif
diff -Nru rocfft-5.5.0/clients/tests/unit_test.cpp rocfft-5.7.1/clients/tests/unit_test.cpp
--- rocfft-5.5.0/clients/tests/unit_test.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/clients/tests/unit_test.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -22,8 +22,8 @@
 
 #include "../../shared/environment.h"
 #include "../../shared/gpubuf.h"
+#include "../../shared/rocfft_complex.h"
 #include "hip/hip_runtime_api.h"
-#include "hip/hip_vector_types.h"
 #include <boost/scope_exit.hpp>
 #include <condition_variable>
 #include <fstream>
@@ -177,7 +177,7 @@
 
     size_t requested_work_size = 0;
     ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &requested_work_size), rocfft_status_success);
-    ASSERT_GT(requested_work_size, 0);
+    ASSERT_GT(requested_work_size, 0U);
 
     rocfft_execution_info info;
     ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success);
@@ -311,8 +311,8 @@
         rocfft_plan_destroy(plan);
         plan = nullptr;
     };
-    // check the RTC log to see if a kernel got compiled
-    auto kernel_was_compiled = [&]() {
+    // check the RTC log to see if an FFT kernel got compiled
+    auto fft_kernel_was_compiled = [&]() {
         // HACK: logging is done in a worker thread, so sleep for a
         // bit to give it a chance to actually write.  It at least
         // should flush after writing.
@@ -320,9 +320,10 @@
         // look for a ROCFFT_RTC_BEGIN line that indicates RTC happened
         std::ifstream logfile(rtc_log_path);
         std::string   line;
-        while(logfile >> line)
+        while(std::getline(logfile, line))
         {
-            if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos)
+            if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos
+               && line.find("fft_") != std::string::npos)
                 return true;
         }
         return false;
@@ -334,7 +335,7 @@
     ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes),
               rocfft_status_success);
     rocfft_cleanup();
-    ASSERT_TRUE(kernel_was_compiled());
+    ASSERT_TRUE(fft_kernel_was_compiled());
 
     // serialized cache should be bigger than empty cache
     ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes);
@@ -349,7 +350,7 @@
     ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes),
               rocfft_status_success);
     rocfft_cleanup();
-    ASSERT_TRUE(kernel_was_compiled());
+    ASSERT_TRUE(fft_kernel_was_compiled());
     ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes);
 
     // re-init library without blowing away cache.  rebuild plan and
@@ -357,7 +358,7 @@
     rocfft_setup();
     build_plan();
     rocfft_cleanup();
-    ASSERT_FALSE(kernel_was_compiled());
+    ASSERT_FALSE(fft_kernel_was_compiled());
 
     // blow away cache again, deserialize one-kernel cache.  re-init
     // library and rebuild plan - kernel should again not be
@@ -367,12 +368,12 @@
     ASSERT_EQ(rocfft_cache_deserialize(onekernel_cache, onekernel_cache_bytes),
               rocfft_status_success);
     rocfft_cleanup();
-    ASSERT_FALSE(kernel_was_compiled());
+    ASSERT_FALSE(fft_kernel_was_compiled());
 
     rocfft_setup();
     build_plan();
     rocfft_cleanup();
-    ASSERT_FALSE(kernel_was_compiled());
+    ASSERT_FALSE(fft_kernel_was_compiled());
 
     // use the cache as a system cache and make the user one an empty
     // in-memory cache.  kernel should still not be recompiled.
@@ -381,7 +382,7 @@
     rocfft_setup();
     build_plan();
     rocfft_cleanup();
-    ASSERT_FALSE(kernel_was_compiled());
+    ASSERT_FALSE(fft_kernel_was_compiled());
 
     // check that the system cache is not written to, even if it's
     // writable by the current user.  after removing the cache, the
@@ -391,11 +392,11 @@
     rocfft_setup();
     build_plan();
     rocfft_cleanup();
-    ASSERT_TRUE(kernel_was_compiled());
+    ASSERT_TRUE(fft_kernel_was_compiled());
     rocfft_setup();
     build_plan();
     rocfft_cleanup();
-    ASSERT_TRUE(kernel_was_compiled());
+    ASSERT_TRUE(fft_kernel_was_compiled());
 }
 
 // make sure cache API functions tolerate null pointers without crashing
@@ -443,8 +444,8 @@
                                       nullptr));
 
     // alloc a complex buffer
-    gpubuf_t<float2> data;
-    ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(float2)), hipSuccess);
+    gpubuf_t<rocfft_complex<float>> data;
+    ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(rocfft_complex<float>)), hipSuccess);
 
     std::vector<void*> ibuffers(1, static_cast<void*>(data.data()));
 
diff -Nru rocfft-5.5.0/debian/changelog rocfft-5.7.1/debian/changelog
--- rocfft-5.5.0/debian/changelog	2023-12-19 14:59:07.000000000 +0000
+++ rocfft-5.7.1/debian/changelog	2024-03-12 17:15:10.000000000 +0000
@@ -1,8 +1,46 @@
-rocfft (5.5.0-6build1) noble; urgency=medium
+rocfft (5.7.1-1) unstable; urgency=medium
 
-  * No-change rebuild for boost defaults change.
+  * Migrate to unstable
+  * New upstream version includes fix for LDS over-allocation
+    (Closes: #1057251)
 
- -- Matthias Klose <doko@ubuntu.com>  Tue, 19 Dec 2023 15:59:07 +0100
+ -- Cordell Bloor <cgmb@slerp.xyz>  Tue, 12 Mar 2024 11:15:10 -0600
+
+rocfft (5.7.1-1~exp2) experimental; urgency=medium
+
+  * d/patches: Add missing DEP-3 headers
+  * d/control: Re-add accidentally removed B-D libhiprand-dev.
+    Fixes a FTBFS.
+  * symbols: Strip Debian revision
+
+ -- Christian Kastner <ckk@debian.org>  Sat, 02 Mar 2024 21:12:10 +0100
+
+rocfft (5.7.1-1~exp1) experimental; urgency=medium
+
+  * New upstream version.
+    - Update symbols file
+    - Refresh patches
+    - Add patch use-readthedocs-theme.patch
+      Restores documentation build using the simpler approach from a previous
+      version
+  * d/rules:
+    - Add gfx1100, gfx1101 and gfx1102 build targets
+    - Drop patchelf --remove-rpath from build rules
+    - Automate handling of rocFFT version string
+  * d/control:
+    - Constrain versions for clang-17
+    - Drop unused rocminfo package
+    - Switch B-D from librocrand-dev to libhiprand-dev
+    - Add support for the 'nocheck' build profile
+    - Temporarily B-D on libamdhip64-dev >= 5.6.1
+      Until either bin:hipcc is fixed or it is determined that a direct
+      dependency on libamdhip64-dev is the right thing to do.
+  * Upstream URL has changed
+  * autopkgtest: Export dmesg and other info as artifacts
+  * Bump copyrights
+  * dbgsym: Disable dwz and switch to compressed DWARF-5
+
+ -- Christian Kastner <ckk@debian.org>  Fri, 01 Mar 2024 23:23:05 +0100
 
 rocfft (5.5.0-6) unstable; urgency=medium
 
diff -Nru rocfft-5.5.0/debian/clean rocfft-5.7.1/debian/clean
--- rocfft-5.5.0/debian/clean	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/clean	2024-03-12 17:13:18.000000000 +0000
@@ -1,2 +1,2 @@
-docs/docBin/
+docs/.doxygen/docBin/
 html/
diff -Nru rocfft-5.5.0/debian/control rocfft-5.7.1/debian/control
--- rocfft-5.5.0/debian/control	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/control	2024-03-12 17:13:18.000000000 +0000
@@ -1,6 +1,6 @@
 Source: rocfft
 Section: devel
-Homepage: https://github.com/rocmsoftwareplatform/rocfft
+Homepage: https://github.com/ROCm/rocfft
 Priority: optional
 Standards-Version: 4.6.2
 Vcs-Git: https://salsa.debian.org/rocm-team/rocfft.git
@@ -11,18 +11,19 @@
            Christian Kastner <ckk@debian.org>,
 Build-Depends: debhelper-compat (= 13),
                cmake,
-               hipcc,
-               libamd-comgr-dev,
-               libhsa-runtime-dev,
-               rocminfo,
-               patchelf,
+               hipcc (>= 5.6.1~),
+# ckk 2024-03-02: temporary until hipcc question is resolved:
+               libamdhip64-dev (>= 5.6.1~),
+# end
+               libamd-comgr-dev (>= 6.0~),
+               libhsa-runtime-dev (>= 5.7.1~),
                rocm-cmake (>= 5.3.0),
                python3-dev,
                libsqlite3-dev,
-               librocrand-dev,
+               libhiprand-dev,
                libboost-program-options-dev,
                libfftw3-dev,
-               libgtest-dev,
+               libgtest-dev <!nocheck>,
 Build-Depends-Indep: dh-sequence-sphinxdoc <!nodoc>,
                doxygen <!nodoc>,
                python3-breathe <!nodoc>,
@@ -67,6 +68,7 @@
 Package: librocfft0-tests
 Section: libdevel
 Architecture: amd64 arm64 ppc64el
+Build-Profiles: <!nocheck>
 Depends: librocfft0 (= ${binary:Version}),${misc:Depends}, ${shlibs:Depends},
 Description: ROCm library for computing Fast Fourier Transforms - tests
  rocFFT is a library for computing the discrete Fourier transform. It is
diff -Nru rocfft-5.5.0/debian/copyright rocfft-5.7.1/debian/copyright
--- rocfft-5.5.0/debian/copyright	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/copyright	2024-03-12 17:13:18.000000000 +0000
@@ -9,6 +9,7 @@
 Files: debian/*
 Copyright: 2022, Maxime Chambonnet <maxzor@maxzor.eu>
            2022-2023, Cordell Bloor <cgmb@slerp.xyz>
+           2024, Christian Kastner <ckk@debian.org>
 License: Expat
 
 License: Expat
diff -Nru rocfft-5.5.0/debian/librocfft0.install rocfft-5.7.1/debian/librocfft0.install
--- rocfft-5.5.0/debian/librocfft0.install	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/librocfft0.install	2024-03-12 17:13:18.000000000 +0000
@@ -1,2 +1,2 @@
 usr/lib/*/librocfft.so.*
-usr/lib/*/rocfft/1.0.21/rocfft_rtc_helper
+usr/lib/*/rocfft/1.0.23/rocfft_rtc_helper
diff -Nru rocfft-5.5.0/debian/librocfft0.symbols.amd64 rocfft-5.7.1/debian/librocfft0.symbols.amd64
--- rocfft-5.5.0/debian/librocfft0.symbols.amd64	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/librocfft0.symbols.amd64	2024-03-12 17:13:18.000000000 +0000
@@ -1,96 +1,215 @@
 librocfft.so.0 librocfft0 #MINVER#
 * Build-Depends-Package: librocfft-dev
- (optional)_Z24GenerateHalfNTableKernelI15HIP_vector_typeIdLj2EEEvmmPT_@Base 5.5.0
- (optional)_Z24GenerateHalfNTableKernelI15HIP_vector_typeIfLj2EEEvmmPT_@Base 5.5.0
- (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIdLj2EEEvmm9radices_tS2_S2_PT_@Base 5.5.0
- (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIdLj2EEEvmmPT_@Base 5.5.0
- (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIfLj2EEEvmm9radices_tS2_S2_PT_@Base 5.5.0
- (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIfLj2EEEvmmPT_@Base 5.5.0
- (optional)_Z31GenerateTwiddleTableLargeKernelI15HIP_vector_typeIdLj2EEEvdmmmPT_@Base 5.5.0
- (optional)_Z31GenerateTwiddleTableLargeKernelI15HIP_vector_typeIfLj2EEEvdmmmPT_@Base 5.5.0
- (optional)_ZN13function_poolC1Ev@Base 5.5.0
- (optional)_ZN13function_poolC2Ev@Base 5.5.0
+ (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEclEcE5__nul@Base 5.7.1
+ (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEclEcE5__nul@Base 5.7.1
+ (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEclEcE5__nul@Base 5.7.1
+ (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEclEcE5__nul@Base 5.7.1
  (optional)_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z@Base 5.5.0
- (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE10_M_emplaceIJS6_S9_EEES7_INSC_14_Node_iteratorISA_Lb0ELb0EEEbESt17integral_constantIbLb1EEDpOT_@Base 5.5.0
- (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE12_Scoped_nodeD2Ev@Base 5.5.0
- (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE13_M_rehash_auxEmSt17integral_constantIbLb1EE@Base 5.5.0
- (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE21_M_insert_unique_nodeEmmPNSC_10_Hash_nodeISA_Lb0EEEm@Base 5.5.0
+ (optional)_ZNKSt7__cxx1112regex_traitsIcE16lookup_classnameIPKcEENS1_10_RegexMaskET_S6_b@Base 5.7.1
+ (optional)_ZNKSt7__cxx1112regex_traitsIcE18lookup_collatenameIPKcEENS_12basic_stringIcSt11char_traitsIcESaIcEEET_SA_@Base 5.7.1
+ (optional)_ZNKSt7__cxx1112regex_traitsIcE5valueEci@Base 5.7.1
+ (optional)_ZNKSt7__cxx1114regex_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEeqERKSD_@Base 5.7.1
+ (optional)_ZNKSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEeqERKSD_@Base 5.7.1
+ (optional)_ZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE16_M_word_boundaryEv@Base 5.7.1
+ (optional)_ZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE16_M_word_boundaryEv@Base 5.7.1
  (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEE13_M_rehash_auxEmSt17integral_constantIbLb1EE@Base 5.5.0
  (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEE16_M_insert_uniqueIRS1_RKS4_NS6_10_AllocNodeISaINS6_10_Hash_nodeIS4_Lb0EEEEEEEES0_INS6_14_Node_iteratorIS4_Lb0ELb0EEEbEOT_OT0_RKT1_@Base 5.5.0
  (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEE21_M_insert_unique_nodeEmmPNS6_10_Hash_nodeIS4_Lb0EEEm@Base 5.5.0
  (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEEC2IPKS4_EET_SL_mRKSB_RKS9_RKS5_St17integral_constantIbLb1EE@Base 5.5.0
  (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEED2Ev@Base 5.5.0
  (optional)_ZNSt10filesystem7__cxx11dvERKNS0_4pathES3_@Base 5.5.0
- (optional)_ZNSt13unordered_mapISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEE9FFTKernel10SimpleHashSt8equal_toIS6_ESaISt4pairIKS6_S7_EEED2Ev@Base 5.5.0
+ (optional)_ZNSt11_Deque_baseINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE17_M_initialize_mapEm@Base 5.7.1
+ (optional)_ZNSt11_Deque_baseIlSaIlEE17_M_initialize_mapEm@Base 5.7.1
  (optional)_ZNSt15__exception_ptr12__dest_thunkISt12future_errorEEvPv@Base 5.5.0
+ (optional)_ZNSt5dequeINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE16_M_push_back_auxIJRKS5_EEEvDpOT_@Base 5.7.1
+ (optional)_ZNSt5dequeINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE16_M_push_back_auxIJS5_EEEvDpOT_@Base 5.7.1
+ (optional)_ZNSt5dequeINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE17_M_reallocate_mapEmb@Base 5.7.1
+ (optional)_ZNSt5dequeIlSaIlEE16_M_push_back_auxIJRKlEEEvDpOT_@Base 5.7.1
+ (optional)_ZNSt5dequeIlSaIlEE17_M_reallocate_mapEmb@Base 5.7.1
  (optional)_ZNSt6vectorINSt10filesystem7__cxx114pathESaIS2_EE17_M_realloc_insertIJRKS2_EEEvN9__gnu_cxx17__normal_iteratorIPS2_S4_EEDpOT_@Base 5.5.0
  (optional)_ZNSt6vectorINSt10filesystem7__cxx114pathESaIS2_EE17_M_realloc_insertIJS2_EEEvN9__gnu_cxx17__normal_iteratorIPS2_S4_EEDpOT_@Base 5.5.0
+ (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EE17_M_default_appendEm@Base 5.7.1
+ (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EE17_M_realloc_insertIJRKS5_EEEvN9__gnu_cxx17__normal_iteratorIPS5_S7_EEDpOT_@Base 5.7.1
  (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EE17_M_realloc_insertIJS5_EEEvN9__gnu_cxx17__normal_iteratorIPS5_S7_EEDpOT_@Base 5.5.0
  (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EEaSERKS7_@Base 5.5.0
- (optional)_ZNSt6vectorIS_IcSaIcEESaIS1_EE17_M_default_appendEm@Base 5.5.0
+ (optional)_ZNSt6vectorINSt7__cxx119sub_matchIN9__gnu_cxx17__normal_iteratorIPKcNS0_12basic_stringIcSt11char_traitsIcESaIcEEEEEEESaISC_EE14_M_fill_assignEmRKSC_@Base 5.7.1
+ (optional)_ZNSt6vectorINSt7__cxx119sub_matchIN9__gnu_cxx17__normal_iteratorIPKcNS0_12basic_stringIcSt11char_traitsIcESaIcEEEEEEESaISC_EEaSERKSE_@Base 5.7.1
+ (optional)_ZNSt6vectorINSt8__detail6_StateIcEESaIS2_EE17_M_realloc_insertIJS2_EEEvN9__gnu_cxx17__normal_iteratorIPS2_S4_EEDpOT_@Base 5.7.1
+ (optional)_ZNSt6vectorIS_ImSaImEESaIS1_EE17_M_realloc_insertIJRKS1_EEEvN9__gnu_cxx17__normal_iteratorIPS1_S3_EEDpOT_@Base 5.7.1
+ (optional)_ZNSt6vectorISt3setINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt4lessIS6_ESaIS6_EESaISA_EE17_M_default_appendEm@Base 5.7.1
+ (optional)_ZNSt6vectorISt4pairINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES6_ESaIS7_EE17_M_realloc_insertIJS7_EEEvN9__gnu_cxx17__normal_iteratorIPS7_S9_EEDpOT_@Base 5.7.1
+ (optional)_ZNSt6vectorISt4pairIlS_INSt7__cxx119sub_matchIN9__gnu_cxx17__normal_iteratorIPKcNS1_12basic_stringIcSt11char_traitsIcESaIcEEEEEEESaISD_EEESaISG_EE17_M_realloc_insertIJRlRKSF_EEEvNS4_IPSG_SI_EEDpOT_@Base 5.7.1
+ (optional)_ZNSt6vectorIbSaIbEE14_M_fill_insertESt13_Bit_iteratormb@Base 5.7.1
  (optional)_ZNSt6vectorIcSaIcEE17_M_default_appendEm@Base 5.5.0
+ (optional)_ZNSt6vectorIdSaIdEE17_M_default_appendEm@Base 5.7.1
+ (optional)_ZNSt6vectorIiSaIiEE17_M_default_appendEm@Base 5.7.1
+ (optional)_ZNSt6vectorIiSaIiEEaSERKS1_@Base 5.7.1
  (optional)_ZNSt6vectorIjSaIjEE15_M_range_insertIN9__gnu_cxx17__normal_iteratorIPjS1_EEEEvS6_T_S7_St20forward_iterator_tag@Base 5.5.0
+ (optional)_ZNSt6vectorIjSaIjEEaSERKS1_@Base 5.7.1
  (optional)_ZNSt6vectorImSaImEE13_M_assign_auxIPKmEEvT_S5_St20forward_iterator_tag@Base 5.5.0
  (optional)_ZNSt6vectorImSaImEE15_M_range_insertIN9__gnu_cxx17__normal_iteratorIPKmS1_EEEEvNS4_IPmS1_EET_SA_St20forward_iterator_tag@Base 5.5.0
+ (optional)_ZNSt6vectorImSaImEE15_M_range_insertIN9__gnu_cxx17__normal_iteratorIPmS1_EEEEvS6_T_S7_St20forward_iterator_tag@Base 5.7.1
  (optional)_ZNSt6vectorImSaImEE17_M_default_appendEm@Base 5.5.0
  (optional)_ZNSt6vectorImSaImEEaSERKS1_@Base 5.5.0
+ (optional)_ZNSt7__cxx1114regex_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEppEv@Base 5.7.1
+ (optional)_ZNSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEE7_M_initESA_SA_@Base 5.7.1
+ (optional)_ZNSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEaSERKSD_@Base 5.7.1
+ (optional)_ZNSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEppEv@Base 5.7.1
  (optional)_ZNSt7__cxx119to_stringEi@Base 5.5.0
+ (optional)_ZNSt7__cxx119to_stringEj@Base 5.7.1
  (optional)_ZNSt7__cxx119to_stringEm@Base 5.5.0
  (optional)_ZNSt8_Rb_treeINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES5_St9_IdentityIS5_ESt4lessIS5_ESaIS5_EE16_M_insert_uniqueIRKS5_EESt4pairISt17_Rb_tree_iteratorIS5_EbEOT_@Base 5.5.0
  (optional)_ZNSt8_Rb_treeINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES5_St9_IdentityIS5_ESt4lessIS5_ESaIS5_EE24_M_get_insert_unique_posERKS5_@Base 5.5.0
  (optional)_ZNSt8_Rb_treeINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES5_St9_IdentityIS5_ESt4lessIS5_ESaIS5_EE8_M_eraseEPSt13_Rb_tree_nodeIS5_E@Base 5.5.0
+ (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE16_M_insert_uniqueIRKS2_EESt4pairISt17_Rb_tree_iteratorIS2_EbEOT_@Base 5.7.1
+ (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE16_M_insert_uniqueIS2_EESt4pairISt17_Rb_tree_iteratorIS2_EbEOT_@Base 5.7.1
+ (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE24_M_get_insert_unique_posERKS2_@Base 5.7.1
+ (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS2_ERKS2_@Base 5.7.1
+ (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE8_M_eraseEPSt13_Rb_tree_nodeIS2_E@Base 5.7.1
+ (optional)_ZNSt8_Rb_treeIlSt4pairIKllESt10_Select1stIS2_ESt4lessIlESaIS2_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS2_ERS1_@Base 5.7.1
+ (optional)_ZNSt8_Rb_treeIlSt4pairIKllESt10_Select1stIS2_ESt4lessIlESaIS2_EE8_M_eraseEPSt13_Rb_tree_nodeIS2_E@Base 5.7.1
  (optional)_ZNSt8_Rb_treeImSt4pairIKmmESt10_Select1stIS2_ESt4lessImESaIS2_EE8_M_eraseEPSt13_Rb_tree_nodeIS2_E@Base 5.5.0
+ (optional)_ZNSt8_Rb_treeImmSt9_IdentityImESt4lessImESaImEE5eraseERKm@Base 5.7.1
  (optional)_ZNSt8_Rb_treeImmSt9_IdentityImESt4lessImESaImEE7_M_copyILb0ENS5_11_Alloc_nodeEEEPSt13_Rb_tree_nodeImESA_PSt18_Rb_tree_node_baseRT0_@Base 5.5.0
  (optional)_ZNSt8_Rb_treeImmSt9_IdentityImESt4lessImESaImEE8_M_eraseEPSt13_Rb_tree_nodeImE@Base 5.5.0
- (optional)_ZNSt8__detail16_Hashtable_allocISaINS_10_Hash_nodeISt4pairIKSt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEE9FFTKernelELb0EEEEE16_M_allocate_nodeIJS9_SB_EEEPSD_DpOT_@Base 5.5.0
- (optional)_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPjSt6vectorIjSaIjEEEENS0_5__ops15_Iter_less_iterEEvT_S9_RT0_@Base 5.5.0
- (optional)_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEENS0_5__ops15_Iter_comp_iterISt7greaterImEEEEvT_SC_RT0_@Base 5.5.0
+ (optional)_ZNSt8__detail17__regex_algo_implIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEEcNS5_12regex_traitsIcEEEEbT_SH_RNS5_13match_resultsISH_T0_EERKNS5_11basic_regexIT1_T2_EENSt15regex_constants15match_flag_typeENS_20_RegexExecutorPolicyEb@Base 5.7.1
+ (optional)_ZNSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEE17_M_insert_backrefEm@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE12_M_eat_classEc@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE14_M_scan_normalEv@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE16_M_scan_in_braceEv@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE17_M_eat_escape_awkEv@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE18_M_eat_escape_ecmaEv@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE18_M_scan_in_bracketEv@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcE19_M_eat_escape_posixEv@Base 5.7.1
+ (optional)_ZNSt8__detail8_ScannerIcEC2EPKcS3_NSt15regex_constants18syntax_option_typeESt6locale@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE11_M_try_charEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE12_M_assertionEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE13_M_quantifierEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE14_M_alternativeEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE14_M_disjunctionEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE14_M_match_tokenENS_12_ScannerBase7_TokenTE@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE16_M_cur_int_valueEi@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb0ELb0EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb0ELb1EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb1ELb0EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb1ELb1EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE21_M_bracket_expressionEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb0ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb0ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb1ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb1ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb0ELb0EEEvb@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb0ELb1EEEvb@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb1ELb0EEEvb@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb1ELb1EEEvb@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb0ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb0ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb1ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb1ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb0ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb0ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb1ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb1ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb0ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb0ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb1ELb0EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb1ELb1EEEvv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE7_M_atomEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEEC2EPKcS6_RKSt6localeNSt15regex_constants18syntax_option_typeE@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE12_M_lookaheadEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE15_M_handle_matchENSH_11_Match_modeEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE16_M_main_dispatchENSH_11_Match_modeESt17integral_constantIbLb0EE@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE16_M_rep_once_moreENSH_11_Match_modeEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE17_M_handle_backrefENSH_11_Match_modeEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE6_M_dfsENSH_11_Match_modeEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE12_M_lookaheadEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE17_M_handle_backrefENSH_11_Match_modeEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE6_M_dfsENSH_11_Match_modeEl@Base 5.7.1
+ (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE9_M_searchEv@Base 5.7.1
+ (optional)_ZNSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEE8_M_cloneEv@Base 5.7.1
  (optional)_ZSt16__do_uninit_copyIN9__gnu_cxx17__normal_iteratorIPKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt6vectorIS7_SaIS7_EEEEPS7_ET0_T_SG_SF_@Base 5.5.0
  (optional)_ZSt16__do_uninit_copyIPKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPS5_ET0_T_SA_S9_@Base 5.5.0
  (optional)_ZSt16__do_uninit_copyIPNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES6_ET0_T_S8_S7_@Base 5.5.0
- (optional)_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIPjSt6vectorIjSaIjEEEElNS0_5__ops15_Iter_less_iterEEvT_S9_T0_T1_@Base 5.5.0
- (optional)_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEElNS0_5__ops15_Iter_comp_iterISt7greaterImEEEEvT_SC_T0_T1_@Base 5.5.0
+ (optional)_ZSt19__throw_regex_errorNSt15regex_constants10error_typeEPKc@Base 5.7.1
  (optional)_ZSt19piecewise_construct@Base 5.5.0
- (optional)_ZSt20__throw_bad_any_castv@Base 5.5.0
- (optional)_ZSt22__final_insertion_sortIN9__gnu_cxx17__normal_iteratorIPjSt6vectorIjSaIjEEEENS0_5__ops15_Iter_less_iterEEvT_S9_T0_@Base 5.5.0
- (optional)_ZSt22__final_insertion_sortIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEENS0_5__ops15_Iter_comp_iterISt7greaterImEEEEvT_SC_T0_@Base 5.5.0
  (optional)_ZSt27__throw_bad_optional_accessv@Base 5.5.0
+ (optional)_ZSt4sortIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEEEvT_S7_@Base 5.7.1
  (optional)_ZSt7find_ifIN9__gnu_cxx17__normal_iteratorIPKmSt6vectorImSaImEEEESt8functionIFbmEEET_SB_SB_T0_@Base 5.5.0
- (optional)_ZSt9__find_ifIN9__gnu_cxx17__normal_iteratorIPKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt6vectorIS7_SaIS7_EEEENS0_5__ops16_Iter_equals_valIS8_EEET_SH_SH_T0_St26random_access_iterator_tag@Base 5.5.0
- (optional)_ZSt9__find_ifIN9__gnu_cxx17__normal_iteratorIPKmSt6vectorImSaImEEEENS0_5__ops10_Iter_predISt8functionIFbmEEEEET_SE_SE_T0_St26random_access_iterator_tag@Base 5.5.0
  (optional)_ZStneRKSt10error_codeRKSt15error_condition@Base 5.5.0
  (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEOS8_PKS5_@Base 5.5.0
+ (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEOS8_RKS8_@Base 5.7.1
  (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEOS8_S9_@Base 5.5.0
  (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEPKS5_OS8_@Base 5.5.0
  (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEPKS5_RKS8_@Base 5.5.0
+ (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_PKS5_@Base 5.7.1
  (optional)_ZTINSt13__future_base13_State_baseV27_SetterIvvEE@Base 5.5.0
  (optional)_ZTINSt13__future_base13_State_baseV2E@Base 5.5.0
  (optional)_ZTINSt13__future_base21_Async_state_commonV2E@Base 5.5.0
  (optional)_ZTINSt13__future_base7_ResultIvEE@Base 5.5.0
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1
  (optional)_ZTISt11_Mutex_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
- (optional)_ZTISt12bad_any_cast@Base 5.5.0
  (optional)_ZTISt16_Sp_counted_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
  (optional)_ZTISt18bad_variant_access@Base 5.5.0
  (optional)_ZTISt19bad_optional_access@Base 5.5.0
  (optional)_ZTISt23_Sp_counted_ptr_inplaceINSt13__future_base13_State_baseV2ESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
+ (optional)_ZTISt23_Sp_counted_ptr_inplaceINSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEEESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.7.1
  (optional)_ZTSNSt13__future_base13_State_baseV27_SetterIvvEE@Base 5.5.0
  (optional)_ZTSNSt13__future_base13_State_baseV2E@Base 5.5.0
  (optional)_ZTSNSt13__future_base21_Async_state_commonV2E@Base 5.5.0
  (optional)_ZTSNSt13__future_base7_ResultIvEE@Base 5.5.0
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1
+ (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1
  (optional)_ZTSSt11_Mutex_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
- (optional)_ZTSSt12bad_any_cast@Base 5.5.0
  (optional)_ZTSSt16_Sp_counted_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
  (optional)_ZTSSt18bad_variant_access@Base 5.5.0
  (optional)_ZTSSt19_Sp_make_shared_tag@Base 5.5.0
  (optional)_ZTSSt19bad_optional_access@Base 5.5.0
  (optional)_ZTSSt23_Sp_counted_ptr_inplaceINSt13__future_base13_State_baseV2ESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
+ (optional)_ZTSSt23_Sp_counted_ptr_inplaceINSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEEESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.7.1
  (optional)_ZTVNSt13__future_base13_State_baseV2E@Base 5.5.0
  (optional)_ZTVNSt13__future_base21_Async_state_commonV2E@Base 5.5.0
  (optional)_ZTVNSt13__future_base7_ResultIvEE@Base 5.5.0
- (optional)_ZTVSt12bad_any_cast@Base 5.5.0
  (optional)_ZTVSt18bad_variant_access@Base 5.5.0
  (optional)_ZTVSt19bad_optional_access@Base 5.5.0
  (optional)_ZTVSt23_Sp_counted_ptr_inplaceINSt13__future_base13_State_baseV2ESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0
+ (optional)_ZTVSt23_Sp_counted_ptr_inplaceINSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEEESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.7.1
+ (optional)_ZZNKSt7__cxx1112regex_traitsIcE16lookup_classnameIPKcEENS1_10_RegexMaskET_S6_bE12__classnamesB5cxx11@Base 5.7.1
+ (optional)_ZZNKSt7__cxx1112regex_traitsIcE18lookup_collatenameIPKcEENS_12basic_stringIcSt11char_traitsIcESaIcEEET_SA_E14__collatenames@Base 5.7.1
+ (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEclEcE5__nul@Base 5.7.1
+ (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEclEcE5__nul@Base 5.7.1
+ (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEclEcE5__nul@Base 5.7.1
+ (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEclEcE5__nul@Base 5.7.1
+ (optional)_ZZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE10_M_is_wordEcE3__s@Base 5.7.1
+ (optional)_ZZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE10_M_is_wordEcE3__s@Base 5.7.1
  (optional)_ZZNSt19_Sp_make_shared_tag5_S_tiEvE5__tag@Base 5.5.0
  rocfft_cache_buffer_free@Base 5.5.0
  rocfft_cache_deserialize@Base 5.5.0
diff -Nru rocfft-5.5.0/debian/patches/0001-remove-use-of-openmp.patch rocfft-5.7.1/debian/patches/0001-remove-use-of-openmp.patch
--- rocfft-5.5.0/debian/patches/0001-remove-use-of-openmp.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0001-remove-use-of-openmp.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,25 +0,0 @@
-From: Cordell Bloor <cgmb@slerp.xyz>
-Date: Sun, 27 Nov 2022 00:34:47 -0700
-Subject: remove use of openmp
-
-It's not clear how OpenMP should be used with Clang. It appears that
-in ROCm, the OpenMP support is provided by openmp-extras (which comes
-from the AOMP build of LLVM).
-
-Fixed upstream in e7b1fe244ab0623e900b4efe75be29545df1163b.
----
- clients/fft_params.h | 1 -
- 1 file changed, 1 deletion(-)
-
-diff --git a/clients/fft_params.h b/clients/fft_params.h
-index d21ba85..af65c25 100644
---- a/clients/fft_params.h
-+++ b/clients/fft_params.h
-@@ -28,7 +28,6 @@
- #include <iostream>
- #include <mutex>
- #include <numeric>
--#include <omp.h>
- #include <random>
- #include <tuple>
- #include <unordered_set>
diff -Nru rocfft-5.5.0/debian/patches/0002-disable-fftw-install.patch rocfft-5.7.1/debian/patches/0002-disable-fftw-install.patch
--- rocfft-5.5.0/debian/patches/0002-disable-fftw-install.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0002-disable-fftw-install.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,37 +0,0 @@
-From afc1c41b4fea1ed7e4797236e3e666c9453033c2 Mon Sep 17 00:00:00 2001
-From: Steve Leung <Steve.Leung@amd.com>
-Date: Wed, 21 Dec 2022 11:27:39 -0700
-Subject: [PATCH] cmake: only install fftw if we built it
-
----
- clients/tests/CMakeLists.txt | 13 +++++--------
- 1 file changed, 5 insertions(+), 8 deletions(-)
-
-diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
-index fe5781f8..5d865908 100644
---- a/clients/tests/CMakeLists.txt
-+++ b/clients/tests/CMakeLists.txt
-@@ -161,18 +161,15 @@ if( BUILD_FFTW OR NOT FFTW_FOUND )
- 
-   # FFTW we build is always threaded
-   set( FFTW_MULTITHREAD TRUE )
--endif()
- 
--if( BUILD_FFTW OR NOT FFTW_FOUND )
-   add_dependencies( rocfft-test fftw_double fftw_single )
-+  rocm_install(
-+    FILES ${FFTW_LIBRARIES}
-+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
-+    COMPONENT clients-common
-+  )
- endif()
- 
--rocm_install(
--  FILES ${FFTW_LIBRARIES}
--  DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
--  COMPONENT clients-common
--)
--
- set( rocfft-test_include_dirs
-   $<BUILD_INTERFACE:${Boost_INCLUDE_DIRS}>
-   $<BUILD_INTERFACE:${FFTW_INCLUDES}>
diff -Nru rocfft-5.5.0/debian/patches/0003-fix-sample-includes.patch rocfft-5.7.1/debian/patches/0003-fix-sample-includes.patch
--- rocfft-5.5.0/debian/patches/0003-fix-sample-includes.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0003-fix-sample-includes.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,94 +0,0 @@
-From 3d6eed1651850c0a7669c75d344e2d54193e3a16 Mon Sep 17 00:00:00 2001
-From: Cory Bloor <Cordell.Bloor@amd.com>
-Date: Mon, 13 Feb 2023 13:36:49 -0700
-Subject: [PATCH] Fix sample include statements (#1013)
-
-For the samples to be built separately from rocfft project, the include
-style needs to be <rocfft/rocfft.h>.
----
- docs/samples/complex_1d.cpp      | 2 +-
- docs/samples/complex_2d.cpp      | 2 +-
- docs/samples/complex_3d.cpp      | 2 +-
- docs/samples/real2complex_1d.cpp | 2 +-
- docs/samples/real2complex_2d.cpp | 2 +-
- docs/samples/real2complex_3d.cpp | 2 +-
- 6 files changed, 6 insertions(+), 6 deletions(-)
-
-diff --git a/docs/samples/complex_1d.cpp b/docs/samples/complex_1d.cpp
-index 5ee9347a..e815f152 100644
---- a/docs/samples/complex_1d.cpp
-+++ b/docs/samples/complex_1d.cpp
-@@ -25,7 +25,7 @@
- 
- #include <hip/hip_runtime_api.h>
- 
--#include "rocfft.h"
-+#include <rocfft/rocfft.h>
- 
- int main(int argc, char* argv[])
- {
-diff --git a/docs/samples/complex_2d.cpp b/docs/samples/complex_2d.cpp
-index 0010aa70..9da818be 100644
---- a/docs/samples/complex_2d.cpp
-+++ b/docs/samples/complex_2d.cpp
-@@ -25,7 +25,7 @@
- 
- #include <hip/hip_runtime_api.h>
- 
--#include "rocfft.h"
-+#include <rocfft/rocfft.h>
- 
- int main(int argc, char* argv[])
- {
-diff --git a/docs/samples/complex_3d.cpp b/docs/samples/complex_3d.cpp
-index ae8f60bd..b547d7ee 100644
---- a/docs/samples/complex_3d.cpp
-+++ b/docs/samples/complex_3d.cpp
-@@ -25,7 +25,7 @@
- 
- #include <hip/hip_runtime_api.h>
- 
--#include "rocfft.h"
-+#include <rocfft/rocfft.h>
- 
- int main(int argc, char* argv[])
- {
-diff --git a/docs/samples/real2complex_1d.cpp b/docs/samples/real2complex_1d.cpp
-index 4ef12d2d..8043bd97 100644
---- a/docs/samples/real2complex_1d.cpp
-+++ b/docs/samples/real2complex_1d.cpp
-@@ -25,7 +25,7 @@
- 
- #include <hip/hip_runtime_api.h>
- 
--#include "rocfft.h"
-+#include <rocfft/rocfft.h>
- 
- int main(int argc, char* argv[])
- {
-diff --git a/docs/samples/real2complex_2d.cpp b/docs/samples/real2complex_2d.cpp
-index 320e027e..06ecdd92 100644
---- a/docs/samples/real2complex_2d.cpp
-+++ b/docs/samples/real2complex_2d.cpp
-@@ -25,7 +25,7 @@
- 
- #include <hip/hip_runtime_api.h>
- 
--#include "rocfft.h"
-+#include <rocfft/rocfft.h>
- 
- int main(int argc, char* argv[])
- {
-diff --git a/docs/samples/real2complex_3d.cpp b/docs/samples/real2complex_3d.cpp
-index 854cdc09..baec2dfe 100644
---- a/docs/samples/real2complex_3d.cpp
-+++ b/docs/samples/real2complex_3d.cpp
-@@ -25,7 +25,7 @@
- 
- #include <hip/hip_runtime_api.h>
- 
--#include "rocfft.h"
-+#include <rocfft/rocfft.h>
- 
- int main(int argc, char* argv[])
- {
diff -Nru rocfft-5.5.0/debian/patches/0004-fix-hiprtc-link.patch rocfft-5.7.1/debian/patches/0004-fix-hiprtc-link.patch
--- rocfft-5.5.0/debian/patches/0004-fix-hiprtc-link.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0004-fix-hiprtc-link.patch	2024-03-12 17:13:18.000000000 +0000
@@ -15,7 +15,7 @@
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
-index ea78131..21c7d81 100644
+index 51faaff..f842795 100644
 --- a/library/src/CMakeLists.txt
 +++ b/library/src/CMakeLists.txt
 @@ -46,7 +46,7 @@ else()
diff -Nru rocfft-5.5.0/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch rocfft-5.7.1/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch
--- rocfft-5.5.0/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,52 +0,0 @@
-From c2a92b6f25067aca8c603a065d3d63617f4d1f9c Mon Sep 17 00:00:00 2001
-From: Cory Bloor <Cordell.Bloor@amd.com>
-Date: Tue, 11 Apr 2023 17:08:24 -0600
-Subject: [PATCH] Search Debian libexec dir for rocfft_rtc_helper (#1064)
-
-On Debian, the rocfft_rtc_helper can be found at
-/usr/lib/<multiarch>/rocfft/<version>/rocfft_rtc_helper
----
- library/src/CMakeLists.txt     | 3 +++
- library/src/rtc_subprocess.cpp | 8 ++++++++
- 2 files changed, 11 insertions(+)
-
-diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
-index 9008c7eb..1724dfeb 100644
---- a/library/src/CMakeLists.txt
-+++ b/library/src/CMakeLists.txt
-@@ -261,6 +261,9 @@ add_library( rocfft-rtc-compile OBJECT
- add_library( rocfft-rtc-subprocess OBJECT
-   rtc_subprocess.cpp
- )
-+target_compile_definitions( rocfft-rtc-subprocess PRIVATE
-+  -DROCFFT_VERSION=${VERSION_STRING}
-+)
- # generation of kernel source
- add_library( rocfft-rtc-gen OBJECT
-   rtc_bluestein_gen.cpp
-diff --git a/library/src/rtc_subprocess.cpp b/library/src/rtc_subprocess.cpp
-index 8d85b467..e8deeec5 100644
---- a/library/src/rtc_subprocess.cpp
-+++ b/library/src/rtc_subprocess.cpp
-@@ -53,6 +53,10 @@ static const char* HELPER_EXE = "rocfft_rtc_helper";
- typedef int        file_handle_type;
- #endif
- 
-+#define TO_STR2(x) #x
-+#define TO_STR(x) TO_STR2(x)
-+#define ROCFFT_VERSION_STRING TO_STR(ROCFFT_VERSION)
-+
- static fs::path find_rtc_helper()
- {
-     // candidate directories for the helper
-@@ -69,6 +73,10 @@ static fs::path find_rtc_helper()
-         fs::path library_parent_path = library_path.parent_path();
-         helper_dirs.push_back(library_parent_path);
- 
-+        // try in a versioned library subdirectory
-+        fs::path subdir_path = library_path.parent_path() / "rocfft" / ROCFFT_VERSION_STRING;
-+        helper_dirs.push_back(subdir_path);
-+
-         // try bin dir, one dir up from library
-         fs::path bin_path = library_parent_path.parent_path() / "bin";
-         helper_dirs.push_back(bin_path);
diff -Nru rocfft-5.5.0/debian/patches/0005-use-readthedocs-theme.patch rocfft-5.7.1/debian/patches/0005-use-readthedocs-theme.patch
--- rocfft-5.5.0/debian/patches/0005-use-readthedocs-theme.patch	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0005-use-readthedocs-theme.patch	2024-03-12 17:13:18.000000000 +0000
@@ -0,0 +1,196 @@
+From: Christian Kastner <ckk@debian.org>
+Date: Fri, 1 Mar 2024 22:36:47 +0100
+Subject: Use readthedocs theme
+
+The newer documentation build requires packages not yet available, so we
+simply revert to conf.py from the 5.5.1 release for now.
+
+Forwarded: not-needed
+---
+ docs/conf.py | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 171 insertions(+), 6 deletions(-)
+
+diff --git a/docs/conf.py b/docs/conf.py
+index 101fe22..f8d5e68 100644
+--- a/docs/conf.py
++++ b/docs/conf.py
+@@ -1,8 +1,173 @@
+-from rocm_docs import ROCmDocs
++# -*- coding: utf-8 -*-
++#
++# rocFFT documentation build configuration file, created by
++# sphinx-quickstart on Mon Jan  8 16:34:42 2018.
++#
++# This file is execfile()d with the current directory set to its
++# containing dir.
++#
++# Note that not all possible configuration values are present in this
++# autogenerated file.
++#
++# All configuration values have a default; values that are commented out
++# serve to show the default.
+ 
+-docs_core = ROCmDocs("rocFFT Documentation")
+-docs_core.run_doxygen()
+-docs_core.setup()
++# If extensions (or modules to document with autodoc) are in another directory,
++# add these directories to sys.path here. If the directory is relative to the
++# documentation root, use os.path.abspath to make it absolute, like shown here.
++#
++# import os
++# import sys
++# sys.path.insert(0, os.path.abspath('.'))
+ 
+-for sphinx_var in ROCmDocs.SPHINX_VARS:
+-    globals()[sphinx_var] = getattr(docs_core, sphinx_var)
++import os
++import sys
++import subprocess
++
++read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
++
++if read_the_docs_build:
++    subprocess.call('../run_doxygen.sh')
++
++# -- General configuration ------------------------------------------------
++
++# If your documentation needs a minimal Sphinx version, state it here.
++#
++# needs_sphinx = '1.0'
++
++# Add any Sphinx extension module names here, as strings. They can be
++# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
++# ones.
++extensions = ['sphinx.ext.mathjax', 'breathe']
++breathe_projects = {"rocFFT": ".doxygen/docBin/xml"}
++breathe_default_project = "rocFFT"
++
++# Add any paths that contain templates here, relative to this directory.
++templates_path = ['_templates']
++
++# The suffix(es) of source filenames.
++# You can specify multiple suffix as a list of string:
++#
++# source_suffix = ['.rst', '.md']
++source_suffix = '.rst'
++
++# The master toctree document.
++master_doc = 'index'
++
++# General information about the project.
++project = u'rocFFT'
++copyright = u'2016 - 2023, Advanced Micro Devices'
++author = u'Advanced Micro Devices, Inc.'
++
++# The version info for the project you're documenting, acts as replacement for
++# |version| and |release|, also used in various other places throughout the
++# built documents.
++#
++# The short X.Y version.
++version = u'5.7.1'
++# The full version, including alpha/beta/rc tags.
++release = u'5.7.1'
++
++# The language for content autogenerated by Sphinx. Refer to documentation
++# for a list of supported languages.
++#
++# This is also used if you do content translation via gettext catalogs.
++# Usually you set "language" from the command line for these cases.
++language = None
++
++# List of patterns, relative to source directory, that match files and
++# directories to ignore when looking for source files.
++# These patterns also affect html_static_path and html_extra_path
++exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
++
++# The name of the Pygments (syntax highlighting) style to use.
++pygments_style = 'sphinx'
++
++# If true, `todo` and `todoList` produce output, else they produce nothing.
++todo_include_todos = False
++
++# -- Options for HTML output ----------------------------------------------
++
++# The theme to use for HTML and HTML Help pages.  See the documentation for
++# a list of builtin themes.
++#
++# html_theme = 'alabaster'
++
++if read_the_docs_build:
++    html_theme = 'default'
++else:
++    import sphinx_rtd_theme
++    html_theme = "sphinx_rtd_theme"
++    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
++
++# Theme options are theme-specific and customize the look and feel of a theme
++# further.  For a list of options available for each theme, see the
++# documentation.
++#
++# html_theme_options = {}
++
++# Add any paths that contain custom static files (such as style sheets) here,
++# relative to this directory. They are copied after the builtin static files,
++# so a file named "default.css" will overwrite the builtin "default.css".
++# html_static_path = ['_static']
++
++# Custom sidebar templates, must be a dictionary that maps document names
++# to template names.
++#
++# This is required for the alabaster theme
++# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
++# html_sidebars = {
++#     '**': [
++#         'relations.html',  # needs 'show_related': True theme option to display
++#         'searchbox.html',
++#     ]
++# }
++
++# -- Options for HTMLHelp output ------------------------------------------
++
++# Output file base name for HTML help builder.
++htmlhelp_basename = 'rocFFTdoc'
++
++# -- Options for LaTeX output ---------------------------------------------
++
++latex_elements = {
++    # The paper size ('letterpaper' or 'a4paper').
++    #
++    # 'papersize': 'letterpaper',
++
++    # The font size ('10pt', '11pt' or '12pt').
++    #
++    # 'pointsize': '10pt',
++
++    # Additional stuff for the LaTeX preamble.
++    #
++    # 'preamble': '',
++
++    # Latex figure (float) alignment
++    #
++    # 'figure_align': 'htbp',
++}
++
++# Grouping the document tree into LaTeX files. List of tuples
++# (source start file, target name, title,
++#  author, documentclass [howto, manual, or own class]).
++latex_documents = [
++    (master_doc, 'rocFFT.tex', u'rocFFT Documentation',
++     u'Advanced Micro Devices', 'manual'),
++]
++
++# -- Options for manual page output ---------------------------------------
++
++# One entry per manual page. List of tuples
++# (source start file, name, description, authors, manual section).
++man_pages = [(master_doc, 'rocfft', u'rocFFT Documentation', [author], 1)]
++
++# -- Options for Texinfo output -------------------------------------------
++
++# Grouping the document tree into Texinfo files. List of tuples
++# (source start file, target name, title, author,
++#  dir menu entry, description, category)
++texinfo_documents = [
++    (master_doc, 'rocFFT', u'rocFFT Documentation', author, 'rocFFT',
++     'One line description of project.', 'Miscellaneous'),
++]
diff -Nru rocfft-5.5.0/debian/patches/0006-use-local-mathjax.patch rocfft-5.7.1/debian/patches/0006-use-local-mathjax.patch
--- rocfft-5.5.0/debian/patches/0006-use-local-mathjax.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0006-use-local-mathjax.patch	2024-03-12 17:13:18.000000000 +0000
@@ -8,13 +8,13 @@
 
 Forwarded: not-needed
 ---
- docs/source/conf.py | 3 +++
+ docs/conf.py | 3 +++
  1 file changed, 3 insertions(+)
 
-diff --git a/docs/source/conf.py b/docs/source/conf.py
-index d3b33df..2f35c0e 100644
---- a/docs/source/conf.py
-+++ b/docs/source/conf.py
+diff --git a/docs/conf.py b/docs/conf.py
+index 66c6b01..01ba07a 100644
+--- a/docs/conf.py
++++ b/docs/conf.py
 @@ -171,3 +171,6 @@ texinfo_documents = [
      (master_doc, 'rocFFT', u'rocFFT Documentation', author, 'rocFFT',
       'One line description of project.', 'Miscellaneous'),
diff -Nru rocfft-5.5.0/debian/patches/0007-disable-kernel-cache-build.patch rocfft-5.7.1/debian/patches/0007-disable-kernel-cache-build.patch
--- rocfft-5.5.0/debian/patches/0007-disable-kernel-cache-build.patch	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/0007-disable-kernel-cache-build.patch	2024-03-12 17:13:18.000000000 +0000
@@ -8,14 +8,14 @@
 
 Forwarded: not-needed
 ---
- library/src/CMakeLists.txt | 28 ----------------------------
- 1 file changed, 28 deletions(-)
+ library/src/CMakeLists.txt | 26 --------------------------
+ 1 file changed, 26 deletions(-)
 
-diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
-index 21c7d81..1ce5522 100644
---- a/library/src/CMakeLists.txt
-+++ b/library/src/CMakeLists.txt
-@@ -395,23 +395,6 @@ endif()
+Index: rocfft-5.7.1/library/src/CMakeLists.txt
+===================================================================
+--- rocfft-5.7.1.orig/library/src/CMakeLists.txt
++++ rocfft-5.7.1/library/src/CMakeLists.txt
+@@ -497,23 +497,6 @@ endif()
  # build.  any kernels that already exist in this file will be reused
  # between builds.
  
@@ -39,21 +39,26 @@
  rocm_set_soversion( rocfft ${rocfft_SOVERSION} )
  set_target_properties( rocfft PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
  set_target_properties( rocfft PROPERTIES DEBUG_POSTFIX "-d" )
-@@ -443,17 +426,6 @@ rocm_install_targets(
+@@ -545,22 +528,6 @@ rocm_install_targets(
    ${CMAKE_BINARY_DIR}/include
    )
  
--# kernel cache needs to go next to the library - Linux puts shared
+-# kernel cache is architecture-dependent data for the library, placed
+-# in a rocFFT subdirectory next to the library.  Linux puts shared
 -# objects in lib, Windows puts DLLs in bin
 -if(WIN32)
--  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${CMAKE_INSTALL_BINDIR})
+-  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}/rocfft)
 -else()
--  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${ROCM_INSTALL_LIBDIR})
+-  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${ROCM_INSTALL_LIBDIR}/rocfft)
 -endif()
--rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
--  DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
--  COMPONENT runtime
--)
- 
- #         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
- 
+-
+-if( NOT ENABLE_ASAN_PACKAGING )
+-  rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
+-    DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
+-    COMPONENT runtime
+-  )
+-endif()
+-
+ # rtc helper is an internal library executable on Linux, placed in a
+ # rocFFT subdirectory of the library directory.  On Windows it goes
+ # into bin next to the library, to simplify finding DLLs.
diff -Nru rocfft-5.5.0/debian/patches/series rocfft-5.7.1/debian/patches/series
--- rocfft-5.5.0/debian/patches/series	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/patches/series	2024-03-12 17:13:18.000000000 +0000
@@ -1,7 +1,4 @@
-0001-remove-use-of-openmp.patch
-0002-disable-fftw-install.patch
-0003-fix-sample-includes.patch
 0004-fix-hiprtc-link.patch
-0005-add-debian-path-to-rocfft_rtc_helper.patch
+0005-use-readthedocs-theme.patch
 0006-use-local-mathjax.patch
 0007-disable-kernel-cache-build.patch
diff -Nru rocfft-5.5.0/debian/rules rocfft-5.7.1/debian/rules
--- rocfft-5.5.0/debian/rules	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/rules	2024-03-12 17:13:18.000000000 +0000
@@ -1,22 +1,28 @@
 #!/usr/bin/make -f
 export CXX=hipcc
 export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto
-export DEB_CXXFLAGS_MAINT_PREPEND = -gdwarf-4
+export DEB_CXXFLAGS_MAINT_PREPEND = -gz
 export VERBOSE=1
 
 # filter incompatible options from affecting device code
 CXXFLAGS := $(subst -fstack-protector-strong,-Xarch_host -fstack-protector-strong,$(CXXFLAGS))
 CXXFLAGS := $(subst -fcf-protection,-Xarch_host -fcf-protection,$(CXXFLAGS))
 
+# Upstream version, needed for the install path of rocfft_rtc_helper
+VERSION_STRING = $(shell sed -nr 's/^set.*VERSION_STRING \"([.0-9]+)\".*/\1/p' CMakeLists.txt)
+
 CMAKE_FLAGS = \
 	-DCMAKE_BUILD_TYPE=Release \
-	-DAMDGPU_TARGETS="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1030" \
+	-DAMDGPU_TARGETS="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1030;gfx1100;gfx1101;gfx1102" \
 	-DROCM_SYMLINK_LIBS=OFF \
 	-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
-	-DBUILD_CLIENTS_TESTS=ON \
 	-DBUILD_CLIENTS_TESTS_OPENMP=OFF \
 	-DSQLITE_USE_SYSTEM_PACKAGE=ON
 
+ifeq (,$(filter nocheck,$(DEB_BUILD_PROFILES)))
+CMAKE_FLAGS += -DBUILD_CLIENTS_TESTS=ON
+endif
+
 %:
 	dh $@ -Scmake
 
@@ -34,22 +40,15 @@
 endif
 
 execute_before_dh_install-arch:
-	# Note the rpath field setting may cause reproducible build issues.
-	# This should be removed earlier in the toolchain if possible.
-	patchelf --remove-rpath ./debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/librocfft.so.0.1
-	patchelf --remove-rpath ./debian/tmp/usr/bin/rocfft_rtc_helper
-	patchelf --remove-rpath ./debian/tmp/usr/bin/rocfft-test
-	# move rocfft_rtc_helper to a libexec directory
-	mkdir -p ./debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/rocfft/1.0.21
-	mv ./debian/tmp/usr/bin/rocfft_rtc_helper ./debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/rocfft/1.0.21/
+	dh_install -plibrocfft0 usr/lib/*/rocfft/${VERSION_STRING}/rocfft_rtc_helper
 
 override_dh_auto_configure-indep:
 	:
 
 override_dh_auto_build-indep:
 ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
-	cd docs; doxygen
-	sphinx-build -b html docs/source html
+	cd docs/.doxygen && doxygen
+	sphinx-build -b html docs html
 endif
 
 override_dh_auto_test-indep:
@@ -57,3 +56,7 @@
 
 override_dh_auto_install-indep:
 	:
+
+# dwz doesn't fully support DWARF-5 yet, see #1016936
+override_dh_dwz:
+	:
diff -Nru rocfft-5.5.0/debian/tests/control rocfft-5.7.1/debian/tests/control
--- rocfft-5.5.0/debian/tests/control	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/tests/control	2024-03-12 17:13:18.000000000 +0000
@@ -1,5 +1,5 @@
 Test-Command: /bin/sh debian/tests/upstream-binaries librocfft0-tests
 Depends: librocfft0-tests
-Restrictions: skippable, allow-stderr
+Restrictions: skippable, allow-stderr, needs-sudo
 Architecture: amd64 arm64 ppc64el
 
diff -Nru rocfft-5.5.0/debian/tests/upstream-binaries rocfft-5.7.1/debian/tests/upstream-binaries
--- rocfft-5.5.0/debian/tests/upstream-binaries	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/tests/upstream-binaries	2024-03-12 17:13:18.000000000 +0000
@@ -7,7 +7,7 @@
 #   /usr/libexec/rocm/$1
 #
 # Will run all executables in that directory, and exit with status=1 if
-# any failure occured, otherwise with status=0. A failure is defined as an
+# any failure occurred, otherwise with status=0. A failure is defined as an
 # executable exiting with a status != 0.
 
 
@@ -19,7 +19,7 @@
 	echo "Skipping tests."
 	# Magic number to signal 'skipped'
 	exit 77
-elif [ "`id -u`" != "0" ] && [ ! -r /dev/kfd ]
+elif [ "$(id -u)" != "0" ] && [ ! -r /dev/kfd ]
 then
 	echo "/dev/kfd present but no read permission."
 	echo "Skipping tests."
@@ -36,13 +36,37 @@
 	exit 1
 fi
 
-cd "$AUTOPKGTEST_TMP"
+# 16 = testbed failure
+cd "$AUTOPKGTEST_TMP" || exit 16
+
+# First, gather system info
+sudo -n mount -t debugfs none /sys/kernel/debug || true
+if sudo -n [ -d /sys/kernel/debug/dri ]
+then
+	for index in $(sudo -n ls /sys/kernel/debug/dri)
+	do
+		info="/sys/kernel/debug/dri/$index/amdgpu_firmware_info"
+		if sudo -n [ -f "$info" ]
+		then
+			# shellcheck disable=SC2024   # we don't need privileged write
+			sudo -n cat "$info" > "$AUTOPKGTEST_ARTIFACTS/amdgpu_firmware_info.$index"
+		fi
+	done
+else
+	echo "Could not read /sys/kernel/debug/dri" >> "$AUTOPKGTEST_ARTIFACTS/firmware.err"
+fi
+# shellcheck disable=SC2024   # we don't need privileged write
+sudo -n dmesg > "$AUTOPKGTEST_ARTIFACTS/dmesg.before" || true
 
 # Any individual failure is overall failure
 EXITCODE=0
-for TESTNAME in $TESTSDIR/*
+for TESTNAME in "$TESTSDIR"/*
 do
 	$TESTNAME || EXITCODE=1
 done
 
+# Tests might have generated new messages
+# shellcheck disable=SC2024   # we don't need privileged write
+sudo -n dmesg > "$AUTOPKGTEST_ARTIFACTS/dmesg.after" || true
+
 exit $EXITCODE
diff -Nru rocfft-5.5.0/debian/upstream/metadata rocfft-5.7.1/debian/upstream/metadata
--- rocfft-5.5.0/debian/upstream/metadata	2023-11-10 09:02:29.000000000 +0000
+++ rocfft-5.7.1/debian/upstream/metadata	2024-03-12 17:13:18.000000000 +0000
@@ -1,4 +1,4 @@
 ---
-Bug-Database: https://github.com/ROCmSoftwarePlatform/rocFFT/issues
-Bug-Submit: https://github.com/ROCmSoftwarePlatform/rocFFT/issues/new
-Repository-Browse: https://github.com/rocmsoftwareplatform/rocfft
+Bug-Database: https://github.com/ROCm/rocFFT/issues
+Bug-Submit: https://github.com/ROCm/rocFFT/issues/new
+Repository-Browse: https://github.com/ROCm/rocfft
diff -Nru rocfft-5.5.0/docs/.doxygen/Doxyfile rocfft-5.7.1/docs/.doxygen/Doxyfile
--- rocfft-5.5.0/docs/.doxygen/Doxyfile	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/.doxygen/Doxyfile	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,2458 @@
+# Doxyfile 1.8.10
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "rocFFT"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         = v1.0.23
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "prototype interfaces compatible with ROCm platform and HiP"
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           = 
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = docBin
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description.
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        = ../../library/include
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    = 
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                = 
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              = 
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the latter case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      = 
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods with a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = YES
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = YES
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+SHOW_NAMESPACES        = NO
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespaces
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    = 
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            = 
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = ../../library/include/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
+# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.f90 \
+                         *.f \
+                         *.for \
+                         *.tcl \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.as \
+                         *.js
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = 
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        = 
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           = 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS = 
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = ../README.md
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# compiled with the --with-libclang option.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output.
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        = 
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = 
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       = 
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a fully expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               = 
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           = 
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     = 
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               = 
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   = 
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  = 
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  = 
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           = 
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 1
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     = 
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       = 
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       = 
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     = 
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           = 
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           = 
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET = 
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = YES
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = YES
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    = 
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             = 
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = YES 
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sf.net) file that captures the
+# structure of the code including all documentation. Note that this feature is
+# still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = NO
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = __attribute__(x)= \
+                         __inline= \
+                         ROCFFT_EXPORT=
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = NO
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            = 
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               = 
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           = 
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           = 
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           = 
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           = 
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      = 
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff -Nru rocfft-5.5.0/docs/.sphinx/_toc.yml.in rocfft-5.7.1/docs/.sphinx/_toc.yml.in
--- rocfft-5.5.0/docs/.sphinx/_toc.yml.in	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/.sphinx/_toc.yml.in	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,7 @@
+root: index
+subtrees:
+  - numbered: False
+    entries:
+    - file: design/design
+    - file: api
+    - file: allapi
diff -Nru rocfft-5.5.0/docs/.sphinx/requirements.in rocfft-5.7.1/docs/.sphinx/requirements.in
--- rocfft-5.5.0/docs/.sphinx/requirements.in	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/.sphinx/requirements.in	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1 @@
+rocm-docs-core==0.10.3
diff -Nru rocfft-5.5.0/docs/.sphinx/requirements.txt rocfft-5.7.1/docs/.sphinx/requirements.txt
--- rocfft-5.5.0/docs/.sphinx/requirements.txt	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/.sphinx/requirements.txt	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,152 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    pip-compile requirements.in
+#
+accessible-pygments==0.0.3
+    # via pydata-sphinx-theme
+alabaster==0.7.13
+    # via sphinx
+babel==2.12.1
+    # via
+    #   pydata-sphinx-theme
+    #   sphinx
+beautifulsoup4==4.11.2
+    # via pydata-sphinx-theme
+breathe==4.34.0
+    # via rocm-docs-core
+certifi==2022.12.7
+    # via requests
+cffi==1.15.1
+    # via
+    #   cryptography
+    #   pynacl
+charset-normalizer==3.1.0
+    # via requests
+click==8.1.3
+    # via sphinx-external-toc
+cryptography==40.0.2
+    # via pyjwt
+deprecated==1.2.13
+    # via pygithub
+docutils==0.19
+    # via
+    #   breathe
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   sphinx
+gitdb==4.0.10
+    # via gitpython
+gitpython==3.1.31
+    # via rocm-docs-core
+idna==3.4
+    # via requests
+imagesize==1.4.1
+    # via sphinx
+importlib-metadata==6.0.0
+    # via sphinx
+importlib-resources==5.12.0
+    # via rocm-docs-core
+jinja2==3.1.2
+    # via
+    #   myst-parser
+    #   sphinx
+linkify-it-py==1.0.3
+    # via myst-parser
+markdown-it-py==2.2.0
+    # via
+    #   mdit-py-plugins
+    #   myst-parser
+markupsafe==2.1.2
+    # via jinja2
+mdit-py-plugins==0.3.5
+    # via myst-parser
+mdurl==0.1.2
+    # via markdown-it-py
+myst-parser[linkify]==1.0.0
+    # via rocm-docs-core
+packaging==23.0
+    # via
+    #   pydata-sphinx-theme
+    #   sphinx
+pycparser==2.21
+    # via cffi
+pydata-sphinx-theme==0.13.3
+    # via
+    #   rocm-docs-core
+    #   sphinx-book-theme
+pygithub==1.58.1
+    # via rocm-docs-core
+pygments==2.14.0
+    # via
+    #   accessible-pygments
+    #   pydata-sphinx-theme
+    #   sphinx
+pyjwt[crypto]==2.6.0
+    # via pygithub
+pynacl==1.5.0
+    # via pygithub
+pytz==2023.3
+    # via babel
+pyyaml==6.0
+    # via
+    #   myst-parser
+    #   sphinx-external-toc
+requests==2.28.2
+    # via
+    #   pygithub
+    #   sphinx
+rocm-docs-core==0.10.3
+    # via -r requirements.in
+smmap==5.0.0
+    # via gitdb
+snowballstemmer==2.2.0
+    # via sphinx
+soupsieve==2.4
+    # via beautifulsoup4
+sphinx==5.3.0
+    # via
+    #   breathe
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   rocm-docs-core
+    #   sphinx-book-theme
+    #   sphinx-copybutton
+    #   sphinx-design
+    #   sphinx-external-toc
+    #   sphinx-notfound-page
+sphinx-book-theme==1.0.1
+    # via rocm-docs-core
+sphinx-copybutton==0.5.1
+    # via rocm-docs-core
+sphinx-design==0.4.1
+    # via rocm-docs-core
+sphinx-external-toc==0.3.1
+    # via rocm-docs-core
+sphinx-notfound-page==0.8.3
+    # via rocm-docs-core
+sphinxcontrib-applehelp==1.0.4
+    # via sphinx
+sphinxcontrib-devhelp==1.0.2
+    # via sphinx
+sphinxcontrib-htmlhelp==2.0.1
+    # via sphinx
+sphinxcontrib-jsmath==1.0.1
+    # via sphinx
+sphinxcontrib-qthelp==1.0.3
+    # via sphinx
+sphinxcontrib-serializinghtml==1.1.5
+    # via sphinx
+typing-extensions==4.5.0
+    # via pydata-sphinx-theme
+uc-micro-py==1.0.1
+    # via linkify-it-py
+urllib3==1.26.15
+    # via requests
+wrapt==1.15.0
+    # via deprecated
+zipp==3.15.0
+    # via
+    #   importlib-metadata
+    #   importlib-resources
diff -Nru rocfft-5.5.0/docs/Doxyfile rocfft-5.7.1/docs/Doxyfile
--- rocfft-5.5.0/docs/Doxyfile	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/Doxyfile	1970-01-01 00:00:00.000000000 +0000
@@ -1,2458 +0,0 @@
-# Doxyfile 1.8.10
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = "rocFFT"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         = v1.0.21
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          = "prototype interfaces compatible with ROCm platform and HiP"
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO           = ./rocm.jpg
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = docBin
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES    = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       = "The $name class" \
-                         "The $name widget" \
-                         "The $name file" \
-                         is \
-                         provides \
-                         specifies \
-                         contains \
-                         represents \
-                         a \
-                         an \
-                         the
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        = ../library/include
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    = 
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 4
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                = 
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              = 
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      = 
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = YES
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = YES
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-SHOW_NAMESPACES        = NO
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = NO
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       = 
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output is used as
-# the file version. For an example see the documentation.
-
-FILE_VERSION_FILTER    = 
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            = 
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES         = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER).
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING.
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = ../library/include/rocfft.h
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
-
-FILE_PATTERNS          = *.c \
-                         *.cc \
-                         *.cxx \
-                         *.cpp \
-                         *.c++ \
-                         *.java \
-                         *.ii \
-                         *.ixx \
-                         *.ipp \
-                         *.i++ \
-                         *.inl \
-                         *.idl \
-                         *.ddl \
-                         *.odl \
-                         *.h \
-                         *.hh \
-                         *.hxx \
-                         *.hpp \
-                         *.h++ \
-                         *.cs \
-                         *.d \
-                         *.php \
-                         *.php4 \
-                         *.php5 \
-                         *.phtml \
-                         *.inc \
-                         *.m \
-                         *.markdown \
-                         *.md \
-                         *.mm \
-                         *.dox \
-                         *.py \
-                         *.f90 \
-                         *.f \
-                         *.for \
-                         *.tcl \
-                         *.vhd \
-                         *.vhdl \
-                         *.ucf \
-                         *.qsf \
-                         *.as \
-                         *.js
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = NO
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                = 
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       = 
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# ANamespace::AClass, ANamespace::*Test
-#
-# Note that the wildcards here are matched against symbol names, not file
-# paths; to exclude files by path (e.g. */test/*) use EXCLUDE_PATTERNS instead.
-
-EXCLUDE_SYMBOLS        = 
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           = 
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       = *
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             = 
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-
-INPUT_FILTER           = 
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS        = 
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS = 
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE = ../README.md
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output.
-# The default value is: YES.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML, the header file must include any scripts and style sheets
-# that doxygen needs, which depend on the configuration options used (e.g. the
-# setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            = 
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            = 
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        = 
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  = 
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       = 
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries to 1 will produce a fully collapsed tree by default. 0 is a special
-# value representing an infinite number of entries and will result in a fully
-# expanded tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               = 
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           = 
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     = 
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               = 
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   = 
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  = 
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  = 
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           = 
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 1
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want the formulas to look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = YES
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     = 
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       = 
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
-# setting. When disabled, doxygen will generate a PHP script for searching and
-# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
-# and searching needs to be provided by external tools. See the section
-# "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       = 
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     = 
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
-# a project to a relative location where its documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. The package can be specified just
-# by its name or with the correct syntax as to be used with the LaTeX
-# \usepackage command. To get the times font for instance you can specify :
-# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
-# To use the option intlimits with the amsmath package you can specify:
-# EXTRA_PACKAGES=[intlimits]{amsmath}
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         = 
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           = 
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
-# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           = 
-
-# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# LaTeX style sheets that are included after the standard style sheets created
-# by doxygen. Using this option one can overrule certain style aspects. Doxygen
-# will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_STYLESHEET = 
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      = 
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = YES
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = YES
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    = 
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    = 
-
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
-# MAN_EXTENSION with the initial . removed.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_SUBDIR             = 
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = YES 
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO, the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
-# in the source code. If set to NO, only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = YES
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = YES
-
-# If the SEARCH_INCLUDES tag is set to YES, the include files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = NO
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           = 
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  = 
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             = __attribute__(x)= \
-                         __inline= \
-                         ROCFFT_EXPORT=
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      = 
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have
-# an all uppercase name, and do not end with a semicolon. Such function macros
-# are typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have a unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               = 
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       = 
-
-# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = NO
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            = 
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               = 
-
-# If set to YES the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: NO.
-
-HAVE_DOT               = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           = 
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command. Disabling a call graph can be
-# accomplished by means of the command \hidecallgraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command. Disabling a caller graph can be
-# accomplished by means of the command \hidecallergraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
-# hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. For an explanation of the image formats see the section
-# output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
-# png:gdiplus:gdiplus.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               = 
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           = 
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           = 
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           = 
-
-# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
-
-PLANTUML_JAR_PATH      = 
-
-# When using plantuml, the specified paths are searched for files specified by
-# the !include statement in a plantuml block.
-
-PLANTUML_INCLUDE_PATH  = 
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that doxygen if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff -Nru rocfft-5.5.0/docs/Makefile rocfft-5.7.1/docs/Makefile
--- rocfft-5.5.0/docs/Makefile	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/Makefile	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = rocFFT
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff -Nru rocfft-5.5.0/docs/allapi.rst rocfft-5.7.1/docs/allapi.rst
--- rocfft-5.5.0/docs/allapi.rst	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/allapi.rst	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,11 @@
+.. toctree::
+   :maxdepth: 4 
+   :caption: Contents:
+
+=============
+API Reference
+=============
+
+.. doxygenindex::
+
+ 
diff -Nru rocfft-5.5.0/docs/api.rst rocfft-5.7.1/docs/api.rst
--- rocfft-5.5.0/docs/api.rst	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/api.rst	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,113 @@
+.. toctree::
+   :maxdepth: 4 
+   :caption: Contents:
+
+=========
+API Usage
+=========
+
+This section describes usage of the rocFFT library API.
+
+Types
+-----
+
+There are a few data structures that are internal to the library. The pointer types to these
+structures are given below. The user would need to use these types to create handles and pass them
+between different library functions.
+
+.. doxygentypedef:: rocfft_plan
+
+.. doxygentypedef:: rocfft_plan_description
+
+.. doxygentypedef:: rocfft_execution_info
+
+Library Setup and Cleanup
+-------------------------
+
+The following functions deal with initialization and cleanup of the library.
+
+.. doxygenfunction:: rocfft_setup
+
+.. doxygenfunction:: rocfft_cleanup
+
+Plan
+----
+
+The following functions are used to create and destroy plan objects.
+
+.. doxygenfunction:: rocfft_plan_create
+
+.. doxygenfunction:: rocfft_plan_destroy
+
+The following functions are used to query for information after a plan is created.
+
+.. doxygenfunction:: rocfft_plan_get_work_buffer_size
+
+.. doxygenfunction:: rocfft_plan_get_print
+
+Plan description
+----------------
+
+Most of the time, :cpp:func:`rocfft_plan_create` is able to fully
+specify a transform.  Advanced plan details such as strides and
+offsets require creation of a plan description object, which is
+configured and passed to the :cpp:func:`rocfft_plan_create` function.
+
+The plan description object can be safely destroyed after it is given
+to the :cpp:func:`rocfft_plan_create` function.
+
+.. doxygenfunction:: rocfft_plan_description_create
+
+.. doxygenfunction:: rocfft_plan_description_destroy
+
+.. doxygenfunction:: rocfft_plan_description_set_scale_factor
+
+.. doxygenfunction:: rocfft_plan_description_set_data_layout
+
+.. comment doxygenfunction:: rocfft_plan_description_set_devices
+
+Execution
+---------
+
+After a plan has been created, it can be executed using the
+:cpp:func:`rocfft_execute` function,
+to compute a transform on specified data. Aspects of the execution can be controlled and any useful
+information returned to the user.
+
+.. doxygenfunction:: rocfft_execute
+
+Execution info
+--------------
+
+:cpp:func:`rocfft_execute` takes an optional :cpp:type:`rocfft_execution_info` parameter. This parameter encapsulates
+information such as the work buffer and compute stream for the transform.
+
+.. doxygenfunction:: rocfft_execution_info_create
+
+.. doxygenfunction:: rocfft_execution_info_destroy
+
+.. doxygenfunction:: rocfft_execution_info_set_work_buffer
+
+.. comment doxygenfunction:: rocfft_execution_info_set_mode
+
+.. doxygenfunction:: rocfft_execution_info_set_stream
+
+.. comment doxygenfunction:: rocfft_execution_info_get_events
+
+
+Enumerations
+------------
+
+This section provides all the enumerations used.
+
+.. doxygenenum:: rocfft_status
+
+.. doxygenenum:: rocfft_transform_type
+
+.. doxygenenum:: rocfft_precision
+
+.. doxygenenum:: rocfft_result_placement
+
+.. doxygenenum:: rocfft_array_type
+
+.. comment doxygenenum:: rocfft_execution_mode
diff -Nru rocfft-5.5.0/docs/conf.py rocfft-5.7.1/docs/conf.py
--- rocfft-5.5.0/docs/conf.py	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/conf.py	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,8 @@
+from rocm_docs import ROCmDocs
+
+docs_core = ROCmDocs("rocFFT Documentation")
+docs_core.run_doxygen()
+docs_core.setup()
+
+for sphinx_var in ROCmDocs.SPHINX_VARS:
+    globals()[sphinx_var] = getattr(docs_core, sphinx_var)
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_1dlen.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_1dlen.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_ex_n7.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_ex_n7.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_ex_n8.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_ex_n8.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_01.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_01.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_02.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_02.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_03.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_03.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_04.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_04.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_05.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_05.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_06.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_06.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_07.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_07.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_expl_08.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_expl_08.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/data/images/realfft_fwdinv.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/data/images/realfft_fwdinv.jpg differ
diff -Nru rocfft-5.5.0/docs/design/bluestein.rst rocfft-5.7.1/docs/design/bluestein.rst
--- rocfft-5.5.0/docs/design/bluestein.rst	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/design/bluestein.rst	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,194 @@
+Bluestein Design Document
+=========================
+
+Copyright and Disclaimer
+------------------------
+
+DISCLAIMER
+
+The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein.  No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document.  Terms and limitations applicable
+to the purchase or use of AMD’s products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and
+Conditions of Sale.
+
+AMD is a trademark of Advanced Micro Devices, Inc.  Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
+
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Summary
+-------
+
+This document revisits the Bluestein algorithm for prime-length discrete Fourier transforms (DFTs), and presents its implementation in the rocFFT library. An optimization of the Bluestein algorithm for large length DFTs is introduced. This optimization provides several benefits, including significantly improved performance and the ability to reuse the design to perform fast convolutions without any major design modifications.
+
+
+Background and Notation
+-----------------------
+Let :math:`\mathbf{X} = \mathcal{F}\left\{ \mathbf{x} \right\}` denote the DFT of :math:`\mathbf{x}`, which maps an :math:`N`-length input sequence :math:`\mathbf{x} = \begin{bmatrix} x_0 &  \cdots & x_{N-1} \end{bmatrix}` into an :math:`N`-length output sequence :math:`\mathbf{X} = \begin{bmatrix} X_0 & \cdots & X_{N-1} \end{bmatrix}` with
+
+.. math::
+
+
+  X_k = \sum_{n=0}^{N-1}{x_n e^{-\frac{2 \pi \jmath}{N}nk}}, \qquad k = 0, \ \ldots, \ N-1.
+
+Conversely, let :math:`\mathbf{x} = \mathcal{F}^{-1}\left\{ \mathbf{X} \right\}` denote the inverse DFT, which maps the sequence :math:`\mathbf{X}` into sequence :math:`\mathbf{x}` as follows
+
+.. math::
+
+   x_k = \frac{1}{N}\sum_{n=0}^{N-1}{X_n e^{\frac{2 \pi \jmath}{N}nk}}, \qquad k = 0, \ \ldots, \ N-1.
+
+
+Bluestein Algorithm
+-------------------
+In Bluestein's algorithm, the following identity is considered for the DFT computation
+
+.. math::
+
+   nk = \frac{-(k-n)^2}{2} + \frac{n^2}{2} + \frac{k^2}{2}.
+
+For example, substituting this identity into the DFT equation, the DFT can then be expressed as
+
+.. math::
+
+   X_k = e^{-\frac{\pi \jmath}{N}k^2} \sum_{n=0}^{N-1}{\left( x_n e^{-\frac{\pi \jmath}{N}n^2} \right) e^{\frac{\pi \jmath}{N}  (k-n)^2}{}}, \qquad k = 0, \ \ldots, \ N-1.
+
++++++
+Chirp
++++++
+Bluestein's algorithm is frequently used to compute the DFT, but it can also be used to compute the more general z-transform. This transform is similar to the DFT equation with the difference that the term :math:`e^{-\frac{2\pi \jmath}{N}}` is replaced by :math:`z`, where :math:`z` is an arbitrary complex number. 
+
+Let :math:`\mathbf{c} = \begin{bmatrix} c_0 & \cdots & c_{N-1} \end{bmatrix}` denote an :math:`N`-length sequence of the form
+
+.. math::
+
+   c_n = e^{\frac{\pi \jmath}{N}n^2}, \qquad n = 0, \ \ldots, \ N-1.
+
+The sequence :math:`\mathbf{c}`, which is present in the Bluestein DFT equation, is also known as a chirp because it defines a complex sinusoid of linearly increasing frequency. Bluestein's algorithm is also known as the chirp z-transform for this reason.
+
++++++++++++
+Convolution
++++++++++++
+
+Now let :math:`\left(\mathbf{a} \ast \mathbf{b}\right)_k` for :math:`k = 0, \ \ldots, \ M-1` denote the convolution of two :math:`M`-length input sequences :math:`\mathbf{a} = \begin{bmatrix} a_0 & \cdots & a_{M-1} \end{bmatrix}` and :math:`\mathbf{b} = \begin{bmatrix} b_0 & \cdots & b_{M-1} \end{bmatrix}` with
+
+.. math::
+
+   \left(\mathbf{a} \ast \mathbf{b} \right)_k = \sum_{m=0}^{M-1}a_m b_{k-m}, \qquad k = 0, \ \ldots, \ M-1.
+
+
+The DFT in the Bluestein DFT equation can be expressed in terms of the convolution sum in the above equation as
+
+.. math::
+
+   X_k = b_k^{-1} \sum_{m=0}^{M-1}{a_m b_{k-m}}, \qquad k = 0, \ \ldots, \ M-1,
+
+with :math:`M=N`, :math:`a_m = x_m / c_m`, and :math:`b_m = c_m` for :math:`m = 0, \ \ldots, \ M-1`.
+
+From the convolution theorem we know that, under suitable conditions, the convolution sum in the convolution definition equation can be evaluated by computing the point-wise product of the DFTs of :math:`\mathbf{a}` and :math:`\mathbf{b}` and taking the inverse DFT of the product
+
+.. math::
+
+   \left(\mathbf{a} \ast \mathbf{b} \right) = \mathcal{F}^{-1}\left\{ \mathcal{F}\left\{ \mathbf{a} \right\} \cdot \mathcal{F}\left\{ \mathbf{b} \right\} \right\}. 
+
+Note, however, that Bluestein's DFT equation in terms of the convolution sum cannot be used to directly evaluate the DFT equation under the values of :math:`M`, :math:`a_m` and :math:`b_m` provided.
+
+++++++++++++
+Zero padding
+++++++++++++
+Consider instead that the DFT in the Bluestein DFT convolution equation is evaluated with
+
+.. math::
+
+   M \geq 2N-1
+
+and the sequences :math:`\mathbf{a}` and :math:`\mathbf{b}` are zero-padded as follows
+
+.. math::
+
+   a_m = \begin{cases} x_m / c_m & \text{for $m = 0, \ \ldots, \ N-1$},\\ 0 & \text{otherwise} \end{cases}
+
+and
+
+.. math::
+
+   b_m = \begin{cases} c_m & \qquad \text{for $m = 0, \ \ldots, \ N-1$},\\ c_{M-m} & \qquad \text{for $m = M - N + 1, \ \ldots, \ M - 1$},\\ 0 & \qquad \text{otherwise.} \end{cases}
+
+In Bluestein's algorithm, the above conditions ensure that the convolution theorem holds and, therefore, Bluestein's DFT equation can be properly employed for the DFT computation.
+
++++++++++++++++++
+DFT via Bluestein
++++++++++++++++++
+
+Based on the two conditions for the sequences :math:`\mathbf{a}` and :math:`\mathbf{b}` obtained above, and the convolution theorem, the DFT can be computed as follows in Bluestein's algorithm
+
+.. math::
+
+   X_k = b_k^{-1} \mathcal{F}^{-1}\left\{ \mathcal{F}\left\{ \mathbf{a} \right\} \cdot \mathcal{F}\left\{ \mathbf{b} \right\} \right\}, \qquad k = 0, \ \ldots, \ N-1.
+
+There are quite a few operations involved in this computation. More specifically, computation of the chirp sequence, two :math:`N`-length plus one :math:`M`-length point-wise multiplications, zero-padding of two :math:`M`-length sequences, and two forward DFTs of length :math:`M` plus an inverse DFT also of length :math:`M`. 
+
+The main reason for using Bluestein's algorithm is that it applies to the DFT computation of any input length :math:`N`, including prime lengths. When a fast Fourier transform (FFT) algorithm is used to compute the DFT, such as Stockham or Cooley-Tukey, it provides optimized length support via a given radix or combination of radices, e.g., :math:`N = 2, \ 3, \ 5, \ 25 \times 2, \ 16 \times 9`, and so on. Considering that the DFTs via Bluestein can be carried out with any length satisfying :math:`M \geq 2N-1`, a suitably chosen value of :math:`M` can be used to compute the convolution via an FFT with existing radix support. However, it should be mentioned that the Bluestein DFT computation is much slower than directly computing the DFT equation via an FFT with a supported length, even though both computations possess the same complexity of :math:`O(N \log N)`.
+
+Implementation
+--------------
+
+An illustration of the steps required for Bluestein's algorithm is given in the figure below.
+
+.. figure:: images/bluestein_fig1.png
+
+   Diagram of computations involved in Bluestein's algorithm
+
+
+A few observations can be made from the block diagram.  First, it can be seen that there are no direct dependencies between the two branches that compute :math:`\mathcal{F}\left\{ \mathbf{a} \right\}` and :math:`\mathcal{F}\left\{ \mathbf{b} \right\}` and, therefore, parallelization may be leveraged to speed up the computations and perform the two sequences of operations independently. Second, it can further be seen that the chirp sequence is used multiple times throughout the diagram. Re-utilizing the computed chirp sequence across the operations where possible may also be advantageous. Third, there are quite a few operations in the diagram, and it is, therefore, often preferable to combine these operations into as few device kernels as possible, due to the overhead of kernel launch.
+
++++++++++++++++++++++++++++
+Device Kernel Configuration
++++++++++++++++++++++++++++
+
+Important factors to consider when designing an efficient implementation of Bluestein's algorithm are (1) the length of the DFT to be performed, (2) the size of available shared memory for the compute device at hand, and (3) the latency for launching device kernels. For instance, when the DFT length is small, all the operations in Bluestein's algorithm may be performed in a single device kernel, if data can fit into shared memory. This minimizes kernel launching overhead and provides the best performance.
+
+In the case where the DFT length is large and the entire data does not fit into shared memory, a hierarchical approach is utilized where the large FFT is decomposed into smaller FFT device kernels that fit into shared memory for improved performance. In this large length DFT scenario, it is important to minimize the number of device kernels utilized in the implementation for reduced kernel launch overhead. 
+
+The default implementation for Bluestein's algorithm when applied to large length DFTs is illustrated in the diagram below.
+
+.. figure:: images/bluestein_fig2.png
+
+   Default device kernel configuration for Bluestein's algorithm and large length DFTs
+
+As can be seen from the diagram, Bluestein's algorithm is performed with (at least) six kernels in a single device stream. The chirp sequence is computed in a single chirp kernel, and the sequence is re-utilized at later stages via a temporary device buffer. The two forward DFTs are joined together in one fft device node. This is possible because the padded sequences :math:`\mathbf{a}` and :math:`\mathbf{b}` are contiguous in the temporary device buffer used in the implementation, thus allowing for a single fft node to perform the two fft operations. The inverse FFT operation requires a separate ifft device node. Similarly, the three point-wise multiplications are carried out with separate kernels, pad\_mul, fft\_mul, and res\_mul. 
+
+Note that the fft (or ifft) nodes are usually split into at least two device kernels for large length DFTs. For example, a large 1D input data vector is viewed as a matrix (with same number of elements as the large vector), and the first FFT device kernel operates on rows of the data matrix while the second device kernel operates on the columns of the data matrix. In this scenario, a total of 8 device kernels are used to perform Bluestein's algorithm.
+
+
+++++++++++++++++++++++++++++++++++++++++++
+Optimizing Bluestein for large length DFTs
+++++++++++++++++++++++++++++++++++++++++++
+
+The default implementation of Bluestein's algorithm for large length DFTs can be optimized by following the design principles:
+
+#. Use the convolution as a building block for the implementation.
+#. Minimize the number of device kernels by fusing FFT read and write operations with Bluestein operations.
+#. Move computation of the chirp sequence from the FFT execution phase to the plan creation phase in rocFFT.
+
+The convolution building block is shown in the diagram below.
+
+.. figure:: images/bluestein_fig3.png
+
+   Proposed configuration of device kernels for fast convolution
+
+In the building block, two independent FFT device nodes are used to carry out the forward DFTs. The point-wise multiplication of the two forward DFTs is fused with the read operation of the iFFT device node. Arranging the convolution in this configuration has two advantages. The independence of the two forward FFT nodes means that parallelism may be leveraged, since the two forward FFT nodes may be executed concurrently if required. Fusing the point-wise multiplication of the two forward DFTs means that a separate kernel for performing the point-wise multiplication is no longer required, thus reducing device kernel launch latency.
+
+A typical use case of the rocFFT library is to create an FFT plan device handle once, and perform FFTs on multiple input data using this same plan handle. As shown in the diagram of Bluestein's algorithm, the chirp sequence :math:`\mathbf{c}` is independent from the input sequence :math:`\mathbf{x}`. Since the execution phase of rocFFT depends only on the input sequence, it is advantageous to precompute :math:`\mathbf{c}` at the plan creation phase of the library. That way, it is not always required to compute :math:`\mathbf{c}` when an FFT is executed, thus reducing the overall amount of computation.
+
+Based upon the three design principles above, an optimized implementation of Bluestein's algorithm is described in the diagram below.
+
+.. figure:: images/bluestein_fig4.png
+
+   Proposed configuration of device kernels for Bluestein's algorithm
+
+As can be seen from the diagram, the implementation of Bluestein's algorithm is quite similar to the fast convolution implementation. The main difference between the two implementations is that the forward/inverse DFT stages have additional fused operations in them. Compared to the default Bluestein implementation, at least three device nodes are used in the optimization. When using the row/column FFT decomposition for large lengths, this brings the total to 6 device kernels in the optimization, a significant reduction in the number of kernels compared to the default configuration.
+
+The read operation of the first DFT stage is fused with chirp + point-wise multiplication + padding. The read operation of the second DFT stage is fused with the chirp + padding. Similarly, the point-wise multiplication of the two forward DFTs is fused with the read operation of the inverse DFT node, and the chirp + point-wise multiplication is fused with its write operation. Since the chirp sequence is computed at the plan level, the chirp operations are performed by simply loading the computed chirp table into device registers. 
+
+Parallelization of the first two FFT nodes can be employed in the optimized implementation; however, preliminary tests have shown that in practice not much performance is gained by executing the two nodes simultaneously. The main reason for this is that a synchronization step is required after the two forward DFT stages. This is denoted by the thin solid rectangle in the diagram. Another factor that needs to be taken into account is that in practice the amount of computation performed on the second FFT node is usually much smaller than the first FFT node. A typical use case of the rocFFT library is to perform batched FFTs. In this scenario, the amount of computation in the two forward FFT nodes is unbalanced since multiple FFTs are performed on the first node while only a single FFT is performed on the second node. This imbalance between the independent nodes makes the benefits of parallelization less pronounced.
+
+One last technical aspect of the optimization is the need to have separate transform and contiguous data indices across the multiple FFT nodes. Since the FFT nodes decompose a large length FFT into a column and a row FFT, the device kernels need to keep track of a global transform index to properly perform the fused read/write Bluestein operations. A similar concept is required for the data index, as the temporary buffers utilized for the computations are accessed in a contiguous fashion for minimal storage requirements.
+
diff -Nru rocfft-5.5.0/docs/design/design.rst rocfft-5.7.1/docs/design/design.rst
--- rocfft-5.5.0/docs/design/design.rst	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/design/design.rst	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,12 @@
+================
+Design Documents
+================
+
+.. toctree::
+   :maxdepth: 3
+   :caption: Contents:
+
+   codegen
+   runtime_compilation
+   buffer_assignment
+   bluestein
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/design/images/bluestein_fig1.png and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/design/images/bluestein_fig1.png differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/design/images/bluestein_fig2.png and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/design/images/bluestein_fig2.png differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/design/images/bluestein_fig3.png and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/design/images/bluestein_fig3.png differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/design/images/bluestein_fig4.png and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/design/images/bluestein_fig4.png differ
diff -Nru rocfft-5.5.0/docs/index.rst rocfft-5.7.1/docs/index.rst
--- rocfft-5.5.0/docs/index.rst	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/index.rst	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,437 @@
+
+.. toctree::
+   :maxdepth: 4 
+   :caption: Contents:
+
+======
+rocFFT
+======
+
+Introduction
+------------
+
+The rocFFT library is an implementation of the discrete Fast Fourier Transform (FFT) written in HIP for GPU devices.
+The code is open and hosted here: https://github.com/ROCmSoftwarePlatform/rocFFT
+
+The rocFFT library:
+
+* Provides a fast and accurate platform for calculating discrete FFTs.
+* Supports half (FP16), single, and double precision floating point formats.
+* Supports 1D, 2D, and 3D transforms.
+* Supports computation of transforms in batches.
+* Supports real and complex FFTs.
+* Supports arbitrary lengths, with optimizations for combinations of
+  powers of 2, 3, 5, 7, 11, 13, and 17.
+
+FFT Computation
+---------------
+
+The FFT is an implementation of the Discrete Fourier Transform (DFT) that makes use of symmetries in the DFT definition to
+reduce the mathematical complexity from :math:`O(N^2)` to :math:`O(N \log N)`.
+
+What is computed by the library? Here are the formulas:
+
+For a 1D complex DFT:
+
+:math:`{\tilde{x}}_j = \sum_{k=0}^{n-1}x_k\exp\left({\pm i}{{2\pi jk}\over{n}}\right)\hbox{ for } j=0,1,\ldots,n-1`
+
+where, :math:`x_k` are the complex data to be transformed, :math:`\tilde{x}_j` are the transformed data, and the sign :math:`\pm`
+determines the direction of the transform: :math:`-` for forward and :math:`+` for backward.
+
+For a 2D complex DFT:
+
+:math:`{\tilde{x}}_{jk} = \sum_{q=0}^{m-1}\sum_{r=0}^{n-1}x_{rq}\exp\left({\pm i} {{2\pi jr}\over{n}}\right)\exp\left({\pm i}{{2\pi kq}\over{m}}\right)`
+
+for :math:`j=0,1,\ldots,n-1\hbox{ and } k=0,1,\ldots,m-1`, where, :math:`x_{rq}` are the complex data to be transformed,
+:math:`\tilde{x}_{jk}` are the transformed data, and the sign :math:`\pm` determines the direction of the transform.
+
+For a 3D complex DFT:
+
+:math:`\tilde{x}_{jkl} = \sum_{s=0}^{p-1}\sum_{q=0}^{m-1}\sum_{r=0}^{n-1}x_{rqs}\exp\left({\pm i} {{2\pi jr}\over{n}}\right)\exp\left({\pm i}{{2\pi kq}\over{m}}\right)\exp\left({\pm i}{{2\pi ls}\over{p}}\right)`
+
+for :math:`j=0,1,\ldots,n-1\hbox{ and } k=0,1,\ldots,m-1\hbox{ and } l=0,1,\ldots,p-1`, where :math:`x_{rqs}` are the complex data to
+be transformed, :math:`\tilde{x}_{jkl}` are the transformed data, and the sign :math:`\pm` determines the direction of the transform.
+
+Library Setup and Cleanup
+-------------------------
+
+At the beginning of the program, before any of the library APIs are called, the function :cpp:func:`rocfft_setup` has to be called. Similarly,
+the function :cpp:func:`rocfft_cleanup` has to be called at the end of the program. These APIs ensure resources are properly allocated and freed.
+
+Workflow
+--------
+
+In order to compute an FFT with rocFFT, a plan has to be created first. A plan is a handle to an internal data structure that
+holds the details about the transform that the user wishes to compute. After the plan is created, it can be executed (a separate API call)
+with the specified data buffers. The execution step can be repeated any number of times with the same plan on different input/output buffers
+as needed. And when the plan is no longer needed, it gets destroyed.
+
+To do a transform,
+
+#. Initialize the library by calling :cpp:func:`rocfft_setup()`.
+#. Create a plan, for each distinct type of FFT needed:
+
+   * To create a plan, do either of the following
+
+     * If the plan specification is simple, call :cpp:func:`rocfft_plan_create` and specify the value of the fundamental parameters.
+     * If the plan has more details, first a plan description is created with :cpp:func:`rocfft_plan_description_create`, and additional APIs such
+       as :cpp:func:`rocfft_plan_description_set_data_layout` are called to specify plan details. And then, :cpp:func:`rocfft_plan_create` is called
+       with the description handle passed to it along with other details.
+
+   * Optionally, allocate a work buffer for the plan:
+
+     * Call :cpp:func:`rocfft_plan_get_work_buffer_size` to check the size of work buffer required by the plan.
+     * If a nonzero size is required:
+
+       * Create an execution info object with :cpp:func:`rocfft_execution_info_create`.
+       * Allocate a buffer using :cpp:func:`hipMalloc` and pass the allocated buffer to :cpp:func:`rocfft_execution_info_set_work_buffer`.
+
+#. Execute the plan:
+
+   * The execution API :cpp:func:`rocfft_execute` is used to do the actual computation on the data buffers specified.
+   * Extra execution information such as work buffers and compute streams are passed to :cpp:func:`rocfft_execute` in the :cpp:type:`rocfft_execution_info` object.
+   * :cpp:func:`rocfft_execute` can be called repeatedly as needed for different data, with the same plan.
+   * If the plan requires a work buffer but none was provided, :cpp:func:`rocfft_execute` will automatically allocate a work buffer and free it when execution is finished.
+
+#. If a work buffer was allocated:
+
+   * Call :cpp:func:`hipFree` to free the work buffer.
+   * Call :cpp:func:`rocfft_execution_info_destroy` to destroy the execution info object.
+
+#. Destroy the plan by calling :cpp:func:`rocfft_plan_destroy`.
+#. Terminate the library by calling :cpp:func:`rocfft_cleanup()`.
+
+
+Example
+-------
+
+.. code-block:: cpp
+
+   #include <iostream>
+   #include <vector>
+   #include "hip/hip_runtime_api.h"
+   #include "hip/hip_vector_types.h"
+   #include "rocfft.h"
+   
+   int main()
+   {
+           // rocFFT gpu compute
+           // ========================================
+  
+           rocfft_setup();
+
+           size_t N = 16;
+           size_t Nbytes = N * sizeof(float2);
+   
+           // Create HIP device buffer
+           float2 *x;
+           hipMalloc(&x, Nbytes);
+   
+           // Initialize data
+           std::vector<float2> cx(N);
+           for (size_t i = 0; i < N; i++)
+           {
+                   cx[i].x = 1;
+                   cx[i].y = -1;
+           }
+   
+           //  Copy data to device
+           hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);
+   
+           // Create rocFFT plan
+           rocfft_plan plan = nullptr;
+           size_t length = N;
+           rocfft_plan_create(&plan, rocfft_placement_inplace,
+                rocfft_transform_type_complex_forward, rocfft_precision_single,
+                1, &length, 1, nullptr);
+
+	   // Check if the plan requires a work buffer
+	   size_t work_buf_size = 0;
+	   rocfft_plan_get_work_buffer_size(plan, &work_buf_size);
+	   void* work_buf = nullptr;
+	   rocfft_execution_info info = nullptr;
+	   if(work_buf_size)
+           {
+                   rocfft_execution_info_create(&info);
+		   hipMalloc(&work_buf, work_buf_size);
+		   rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size);
+           }
+   
+           // Execute plan
+           rocfft_execute(plan, (void**) &x, nullptr, info);
+   
+           // Wait for execution to finish
+           hipDeviceSynchronize();
+
+	   // Clean up work buffer
+	   if(work_buf_size)
+	   {
+	           hipFree(work_buf);
+		   rocfft_execution_info_destroy(info);
+	   }
+
+           // Destroy plan
+           rocfft_plan_destroy(plan);
+   
+           // Copy result back to host
+           std::vector<float2> y(N);
+           hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost);
+   
+           // Print results
+           for (size_t i = 0; i < N; i++)
+           {
+                   std::cout << y[i].x << ", " << y[i].y << std::endl;
+           }
+   
+           // Free device buffer
+           hipFree(x);
+   
+           rocfft_cleanup();
+
+           return 0;
+   }
+
+Plans
+-----
+
+A plan is the collection of (almost) all the parameters needed to specify an FFT computation. A rocFFT plan includes the
+following information:
+
+* Type of transform (complex or real)
+* Dimension of the transform (1D, 2D, or 3D)
+* Length or extent of data in each dimension
+* Number of datasets that are transformed (batch size)
+* Floating-point precision of the data
+* In-place or not in-place transform
+* Format (array type) of the input/output buffer
+* Layout of data in the input/output buffer 
+* Scaling factor to apply to the output of the transform
+
+The rocFFT plan does not include the following parameters:
+
+* The handles to the input and output data buffers.
+* The handle to a temporary work buffer (if needed).
+* Other information to control execution on the device.
+
+These parameters are specified when the plan is executed.
+
+Data
+----
+
+The input/output buffers that hold the data for the transform must be allocated, initialized and specified to the library by the
+user. For larger transforms, temporary work buffers may be needed. Because the library tries to minimize its own allocation of
+memory regions on the device, it expects the user to manage work buffers. The size of the buffer needed can be queried using
+:cpp:func:`rocfft_plan_get_work_buffer_size` and after their allocation can be passed to the library by
+:cpp:func:`rocfft_execution_info_set_work_buffer`. The samples in the source repository show how to use these.
+
+Transform and Array types 
+-------------------------
+
+There are two main types of FFTs in the library:
+
+* Complex FFT - Transformation of complex data (forward or backward); the library supports the following two
+  array types to store complex numbers:
+
+  #. Planar format - where the real and imaginary components are kept in 2 separate arrays:
+
+     * Buffer1: ``RRRRR...`` 
+     * Buffer2: ``IIIII...``
+  #. Interleaved format - where the real and imaginary components are stored as contiguous pairs in the same array: 
+
+     * Buffer: ``RIRIRIRIRIRI...``
+  
+* Real FFT - Transformation of real data. For transforms involving real data, there are two possibilities:
+
+  * Real data being subject to forward FFT that results in complex data (Hermitian).
+  * Complex data (Hermitian) being subject to backward FFT that results in real data.
+
+.. note::
+
+   Real backward FFTs require that the input data be
+   Hermitian-symmetric, as would naturally happen in the output of a
+   real forward FFT.  rocFFT will produce undefined results if
+   this requirement is not met.
+
+The library provides the :cpp:enum:`rocfft_transform_type` and
+:cpp:enum:`rocfft_array_type` enums to specify transform and array
+types, respectively.
+
+Batches
+-------
+
+The efficiency of the library is improved by utilizing transforms in batches. Sending as much data as possible in a single
+transform call leverages the parallel compute capabilities of devices (GPU devices in particular), and minimizes the penalty
+of control transfer. It is best to think of a device as a high-throughput, high-latency device. Using a networking analogy as
+an example, this approach is similar to having a massively high-bandwidth pipe with very high ping response times. If the client
+is ready to send data to the device for compute, it should be sent in as few API calls as possible, and this can be done by batching.
+rocFFT plans have a parameter ``number_of_transforms`` (this value is also referred to as batch size in various places in the document)
+in :cpp:func:`rocfft_plan_create` to describe the number of transforms being requested. All 1D, 2D, and 3D transforms can be batched.
+
+.. _resultplacement:
+
+Result placement
+----------------
+
+The API supports both in-place and not in-place transforms via the :cpp:enum:`rocfft_result_placement` enum.  With in-place transforms, only input buffers are provided to the
+execution API, and the resulting data is written to the same buffer, overwriting the input data.  With not in-place transforms, distinct
+output buffers are provided, and the results are written into the output buffer.
+
+Note that rocFFT may still modify the input buffer even if a transform is requested to be not in-place.  Real-complex transforms in particular are more efficient if they can modify the original input.
+
+Strides and Distances
+---------------------
+
+Strides and distances enable users to specify custom layout of data using :cpp:func:`rocfft_plan_description_set_data_layout`.
+
+For 1D data, if :cpp:expr:`strides[0] == strideX == 1`, successive elements in the first dimension (dimension index 0) are stored
+contiguously in memory. If :cpp:expr:`strideX` is a value greater than 1, gaps in memory exist between each element of the vector.
+For multi-dimensional cases; if :cpp:expr:`strides[1] == strideY == LenX` for 2D data and :cpp:expr:`strides[2] == strideZ == LenX * LenY` for 3D data,
+no gaps exist in memory between each element, and all vectors are stored tightly packed in memory. Here, :cpp:expr:`LenX`, :cpp:expr:`LenY`, and :cpp:expr:`LenZ` denote the
+transform lengths :cpp:expr:`lengths[0]`, :cpp:expr:`lengths[1]`, and :cpp:expr:`lengths[2]`, respectively, which are used to set up the plan.
+
+Distance is the stride that exists between corresponding elements of successive FFT data instances (primitives) in a batch. Distance is measured in units of the memory type;
+complex data measures in complex units, and real data measures in real units. For tightly packed data, the distance between FFT primitives is the size of the FFT primitive,
+such that :cpp:expr:`dist == LenX` for 1D data, :cpp:expr:`dist == LenX * LenY` for 2D data, and :cpp:expr:`dist == LenX * LenY * LenZ` for 3D data. It is possible to set the distance of a plan to be less than the size
+of the FFT vector; typically 1 when doing column (strided) access on packed data. When computing a batch of 1D FFT vectors, if :cpp:expr:`distance == 1`, and :cpp:expr:`strideX == length(vector)`,
+it means data for each logical FFT is read along columns (in this case along the batch). You must verify that the distance and strides are valid, such that each logical
+FFT instance is not overlapping with any other; if not valid, undefined results may occur. A simple example would be to perform a 1D length 4096 transform on each row of an array of
+1024 rows x 4096 columns of values stored in a column-major array, such as a FORTRAN program might provide. (This would be equivalent to a C or C++ program that has an
+array of 4096 rows x 1024 columns stored in a row-major manner, on which you want to perform a 1D length 4096 transform on each column.) In this case, specify the
+strides as [1024] and distance as 1.
+
+Overwriting non-contiguous buffers
+==================================
+
+rocFFT guarantees that both the reading of FFT input and the writing of FFT output will respect the
+specified strides.  However, temporary results can potentially be written to these buffers
+contiguously, which may be unexpected if the strides would avoid certain memory locations completely
+for reading and writing.
+
+For example, a 1D FFT of length :math:`N` with input and output stride of 2 is transforming only
+even-indexed elements in the input and output buffers.  But if temporary data needs to be written to
+the buffers, odd-indexed elements may be overwritten.
+
+However, rocFFT is guaranteed to respect the size of buffers.  In the above example, the
+input/output buffers are :math:`2N` elements long, even if only :math:`N` even-indexed
+elements are being transformed.  No more than :math:`2N` elements of temporary data will be written
+to the buffers during the transform.
+
+These policies apply to both input and output buffers, because :ref:`not in-place transforms may overwrite input data<resultplacement>`.
+
+Transforms of real data
+-----------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   real
+
+Result Scaling
+--------------
+
+The output of a forward or backward FFT often needs to be multiplied
+by a scaling factor before the data can be passed to the next step of
+a computation.  While users of rocFFT can launch a separate GPU
+kernel to do this work, rocFFT provides a
+:cpp:func:`rocfft_plan_description_set_scale_factor` function to more
+efficiently combine this scaling multiplication with the FFT work.
+
+The scaling factor is set on the plan description prior to plan creation.
+
+Load and Store Callbacks
+------------------------
+
+rocFFT includes experimental functionality to call user-defined device functions
+when loading input from global memory at the start of a transform, or
+when storing output to global memory at the end of a transform.
+
+These user-defined callback functions may be optionally supplied
+to the library using
+:cpp:func:`rocfft_execution_info_set_load_callback` and
+:cpp:func:`rocfft_execution_info_set_store_callback`.
+
+Device functions supplied as callbacks must load and store element
+data types that are appropriate for the transform being performed.
+
++-------------------------+--------------------+----------------------+
+|Transform type           | Load element type  | Store element type   |
++=========================+====================+======================+
+|Complex-to-complex,      | `_Float16_2`       | `_Float16_2`         |
+|half-precision           |                    |                      |
++-------------------------+--------------------+----------------------+
+|Complex-to-complex,      | `float2`           | `float2`             |
+|single-precision         |                    |                      |
++-------------------------+--------------------+----------------------+
+|Complex-to-complex,      | `double2`          | `double2`            |
+|double-precision         |                    |                      |
++-------------------------+--------------------+----------------------+
+|Real-to-complex,         | `float`            | `float2`             |
+|single-precision         |                    |                      |
++-------------------------+--------------------+----------------------+
+|Real-to-complex,         | `_Float16`         | `_Float16_2`         |
+|half-precision           |                    |                      |
++-------------------------+--------------------+----------------------+
+|Real-to-complex,         | `double`           | `double2`            |
+|double-precision         |                    |                      |
++-------------------------+--------------------+----------------------+
+|Complex-to-real,         | `_Float16_2`       | `_Float16`           |
+|half-precision           |                    |                      |
++-------------------------+--------------------+----------------------+
+|Complex-to-real,         | `float2`           | `float`              |
+|single-precision         |                    |                      |
++-------------------------+--------------------+----------------------+
+|Complex-to-real,         | `double2`          | `double`             |
+|double-precision         |                    |                      |
++-------------------------+--------------------+----------------------+
+
+The callback function signatures must match the specifications
+below.
+
+.. code-block:: c
+
+  T load_callback(T* buffer, size_t offset, void* callback_data, void* shared_memory);
+  void store_callback(T* buffer, size_t offset, T element, void* callback_data, void* shared_memory);
+
+The parameters for the functions are defined as:
+
+* `T`: The data type of each element being loaded from the input or
+  stored to the output.
+* `buffer`: Pointer to the input (for load callbacks) or
+  output (for store callbacks) in device memory that was passed to
+  :cpp:func:`rocfft_execute`.
+* `offset`: The offset of the location being read from or written
+  to.  This counts in elements, from the `buffer` pointer.
+* `element`: For store callbacks only, the element to be stored.
+* `callback_data`: A pointer value accepted by
+  :cpp:func:`rocfft_execution_info_set_load_callback` and
+  :cpp:func:`rocfft_execution_info_set_store_callback` which is passed
+  through to the callback function.
+* `shared_memory`: A pointer to an amount of shared memory requested
+  when the callback is set.  Currently, shared memory is not supported
+  and this parameter is always null.
+
+Callback functions are called exactly once for each element being
+loaded or stored in a transform.  Note that multiple kernels may be
+launched to decompose a transform, which means that separate kernels
+may call the load and store callbacks for a transform if both are
+specified.
+
+Currently, callback functions are only supported for transforms that
+do not use planar format for input or output.
+
+Runtime compilation
+-------------------
+
+rocFFT includes many kernels for common FFT problems.  Some plans may
+require additional kernels aside from what is built in to the
+library.  In these cases, rocFFT will compile optimized kernels for
+the plan when the plan is created.
+
+Compiled kernels are stored in memory by default and will be reused
+if they are required again for plans in the same process.
+
+If the ``ROCFFT_RTC_CACHE_PATH`` environment variable is set to a
+writable file location, rocFFT will write compiled kernels to this
+location.  rocFFT will read kernels from this location for plans in
+other processes that need runtime-compiled kernels.  rocFFT will
+create the specified file if it does not already exist.
diff -Nru rocfft-5.5.0/docs/real.rst rocfft-5.7.1/docs/real.rst
--- rocfft-5.5.0/docs/real.rst	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/docs/real.rst	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,146 @@
+Real data
+---------
+
+When real data is subject to a DFT, the resulting complex output data follows a special property. About half of the
+output is redundant because it consists of complex conjugates of the other half. This is called the Hermitian redundancy. So, for space
+and performance considerations, it is only necessary to store the non-redundant part of the data. Most FFT libraries use this property to
+offer specific storage layouts for FFTs involving real data. rocFFT
+provides three enumeration values for :cpp:enum:`rocfft_array_type` to deal with real data FFTs:
+
+* REAL (:cpp:enumerator:`rocfft_array_type_real`)
+* HERMITIAN_INTERLEAVED (:cpp:enumerator:`rocfft_array_type_hermitian_interleaved`)
+* HERMITIAN_PLANAR (:cpp:enumerator:`rocfft_array_type_hermitian_planar`)
+
+The REAL (:cpp:enumerator:`rocfft_array_type_real`) enum specifies that the data is purely real. This can be used to feed real input or get back real output. The
+HERMITIAN_INTERLEAVED
+(:cpp:enumerator:`rocfft_array_type_hermitian_interleaved`) and HERMITIAN_PLANAR (:cpp:enumerator:`rocfft_array_type_hermitian_planar`) enums are similar to the corresponding full complex enums in the way
+they store real and imaginary components, but store only about half of the complex output. Client applications can do just a
+forward transform and analyze the output or they can process the output and do a backward transform to get back real data.
+This is illustrated in the following figure.
+
+.. figure:: ./data/images/realfft_fwdinv.jpg
+
+   **Forward and Backward Real FFTs**
+
+.. note::
+
+   Real backward FFTs require that the input data be
+   Hermitian-symmetric, as would naturally happen in the output of a
+   real forward FFT.  rocFFT will produce undefined results if
+   this requirement is not met.
+
+Let us consider a 1D real FFT of length :math:`N`. The full output looks as shown in the following figure.
+
+.. figure:: ./data/images/realfft_1dlen.jpg
+
+   **1D Real FFT of Length N**
+
+Here, C* denotes the complex conjugate. Since the values at indices greater than :math:`N/2` can be deduced from the first half
+of the array, rocFFT stores data only up to the index :math:`N/2`. This means that the output contains only :math:`1 + N/2` complex
+elements, where the division :math:`N/2` is rounded down. Examples for even and odd lengths are given below.
+
+An example for :math:`N = 8` is shown in the following figure.
+
+.. figure:: ./data/images/realfft_ex_n8.jpg
+
+   **Example for N = 8**
+
+An example for :math:`N = 7` is shown in the following figure.
+
+.. figure:: ./data/images/realfft_ex_n7.jpg
+
+   **Example for N = 7**
+
+For length 8, only :math:`(1 + 8/2) = 5` of the output complex numbers are stored, with the index ranging from 0 through 4.
+Similarly for length 7, only :math:`(1 + 7/2) = 4` of the output complex numbers are stored, with the index ranging from 0 through 3.
+For 2D and 3D FFTs, the FFT length along the innermost dimension is used to compute the :math:`(1 + N/2)` value. This is because
+the FFT along the innermost dimension is computed first and is logically a real-to-hermitian transform. The FFTs along
+other dimensions are computed next, and they are simply 'complex-to-complex' transforms. For example, assuming :math:`Lengths[2]`
+is used to set up a 2D real FFT, let :math:`N1 = Lengths[1]`, and :math:`N0 = Lengths[0]`. The output FFT has :math:`N1*(1 + N0/2)` complex elements.
+Similarly, for a 3D FFT with :math:`Lengths[3]` and :math:`N2 = Lengths[2]`, :math:`N1 = Lengths[1]`, and :math:`N0 = Lengths[0]`, the output has :math:`N2*N1*(1 + N0/2)`
+complex elements.
+
+Supported array type combinations
+---------------------------------
+
+Not In-place transforms:
+
+* Forward:  REAL to HERMITIAN_INTERLEAVED
+* Forward:  REAL to HERMITIAN_PLANAR
+* Backward: HERMITIAN_INTERLEAVED to REAL
+* Backward: HERMITIAN_PLANAR to REAL
+
+In-place transforms:
+
+* Forward:  REAL to HERMITIAN_INTERLEAVED
+* Backward: HERMITIAN_INTERLEAVED to REAL
+
+Setting strides
+---------------
+
+The library currently requires the user to explicitly set input and output strides for real transforms for non-simple cases.
+See the following examples to understand what values to use for input and output strides under different scenarios. These examples show
+typical usages, but the user can allocate the buffers and choose data layout according to their need.
+
+Examples
+--------
+
+The following figures and examples explain in detail the real FFT features of this library.
+
+Here is a schematic that illustrates the forward 1D FFT (real to hermitian).
+
+.. figure:: ./data/images/realfft_expl_01.jpg
+
+   **1D FFT - Real to Hermitian**
+
+Below is a schematic that shows an example of not in-place transform with even :math:`N` and how strides and distances are set.
+
+.. figure:: ./data/images/realfft_expl_02.jpg
+
+   **1D FFT - Real to Hermitian, Example 1**
+
+Below is a schematic that shows an example of in-place transform with even :math:`N` and how strides and distances are set.
+Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
+values compared to input strides/distance.
+
+.. figure:: ./data/images/realfft_expl_03.jpg
+
+   **1D FFT - Real to Hermitian, Example 2**
+
+Below is a schematic that shows an example of in-place transform with odd :math:`N` and how strides and distances are set.
+Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
+values compared to input strides/distance.
+
+.. figure:: ./data/images/realfft_expl_04.jpg
+
+   **1D FFT - Real to Hermitian, Example 3**
+
+And here is a schematic that illustrates the backward 1D FFT (hermitian to real).
+
+.. figure:: ./data/images/realfft_expl_05.jpg
+
+   **1D FFT - Hermitian to Real**
+
+Below is a schematic that shows an example of in-place transform with even :math:`N` and how strides and distances are set.
+Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
+values compared to input strides/distance.
+
+.. figure:: ./data/images/realfft_expl_06.jpg
+
+   **1D FFT - Hermitian to Real, Example**
+
+And here is a schematic that illustrates the in-place forward 2D FFT (real to hermitian).
+
+.. figure:: ./data/images/realfft_expl_07.jpg
+
+   **2D FFT - Real to Hermitian In Place**
+
+Below is a schematic that shows an example of in-place 2D transform and how strides and distances are set.
+Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
+values compared to input strides/distance.
+
+.. figure:: ./data/images/realfft_expl_08.jpg
+
+   **2D FFT - Real to Hermitian, Example**
+
+
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/rocm.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/rocm.jpg differ
diff -Nru rocfft-5.5.0/docs/run_doc.sh rocfft-5.7.1/docs/run_doc.sh
--- rocfft-5.5.0/docs/run_doc.sh	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/run_doc.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-# Make this directory the PWD
-cd "$(dirname "${BASH_SOURCE[0]}")"
-
-# Update version string
-cur_version=$(sed -n -e "s/^.*VERSION_STRING.* \"\([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt)
-sed -i -e "s/\(PROJECT_NUMBER.*=\)\(.*\)/\1 v${cur_version}/" Doxyfile
-sed -i -e "s/\(version.*=\)\(.*\)/\1 u'${cur_version}'/" source/conf.py
-sed -i -e "s/\(release.*=\)\(.*\)/\1 u'${cur_version}'/" source/conf.py
-
-# Build doxygen info
-rm -rf docBin
-doxygen Doxyfile
-
-# Build sphinx docs
-cd source
-make clean
-make html
-make latexpdf
-make man
-make text
-make texinfo
diff -Nru rocfft-5.5.0/docs/run_doxygen.sh rocfft-5.7.1/docs/run_doxygen.sh
--- rocfft-5.5.0/docs/run_doxygen.sh	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/run_doxygen.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-# Make this directory the PWD
-cd "$(dirname "${BASH_SOURCE[0]}")"
-
-# Update version string
-cur_version=$(sed -n -e "s/^.*VERSION_STRING.* \"\([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt)
-sed -i -e "s/\(PROJECT_NUMBER.*=\)\(.*\)/\1 v${cur_version}/" Doxyfile
-
-# Build the doxygen info
-rm -rf docBin
-doxygen Doxyfile
diff -Nru rocfft-5.5.0/docs/samples/complex_1d.cpp rocfft-5.7.1/docs/samples/complex_1d.cpp
--- rocfft-5.5.0/docs/samples/complex_1d.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/samples/complex_1d.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -25,7 +25,7 @@
 
 #include <hip/hip_runtime_api.h>
 
-#include "rocfft.h"
+#include <rocfft/rocfft.h>
 
 int main(int argc, char* argv[])
 {
diff -Nru rocfft-5.5.0/docs/samples/complex_2d.cpp rocfft-5.7.1/docs/samples/complex_2d.cpp
--- rocfft-5.5.0/docs/samples/complex_2d.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/samples/complex_2d.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -25,7 +25,7 @@
 
 #include <hip/hip_runtime_api.h>
 
-#include "rocfft.h"
+#include <rocfft/rocfft.h>
 
 int main(int argc, char* argv[])
 {
diff -Nru rocfft-5.5.0/docs/samples/complex_3d.cpp rocfft-5.7.1/docs/samples/complex_3d.cpp
--- rocfft-5.5.0/docs/samples/complex_3d.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/samples/complex_3d.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -25,7 +25,7 @@
 
 #include <hip/hip_runtime_api.h>
 
-#include "rocfft.h"
+#include <rocfft/rocfft.h>
 
 int main(int argc, char* argv[])
 {
diff -Nru rocfft-5.5.0/docs/samples/real2complex_1d.cpp rocfft-5.7.1/docs/samples/real2complex_1d.cpp
--- rocfft-5.5.0/docs/samples/real2complex_1d.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/samples/real2complex_1d.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -25,7 +25,7 @@
 
 #include <hip/hip_runtime_api.h>
 
-#include "rocfft.h"
+#include <rocfft/rocfft.h>
 
 int main(int argc, char* argv[])
 {
diff -Nru rocfft-5.5.0/docs/samples/real2complex_2d.cpp rocfft-5.7.1/docs/samples/real2complex_2d.cpp
--- rocfft-5.5.0/docs/samples/real2complex_2d.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/samples/real2complex_2d.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -25,7 +25,7 @@
 
 #include <hip/hip_runtime_api.h>
 
-#include "rocfft.h"
+#include <rocfft/rocfft.h>
 
 int main(int argc, char* argv[])
 {
diff -Nru rocfft-5.5.0/docs/samples/real2complex_3d.cpp rocfft-5.7.1/docs/samples/real2complex_3d.cpp
--- rocfft-5.5.0/docs/samples/real2complex_3d.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/samples/real2complex_3d.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -25,7 +25,7 @@
 
 #include <hip/hip_runtime_api.h>
 
-#include "rocfft.h"
+#include <rocfft/rocfft.h>
 
 int main(int argc, char* argv[])
 {
diff -Nru rocfft-5.5.0/docs/source/Makefile rocfft-5.7.1/docs/source/Makefile
--- rocfft-5.5.0/docs/source/Makefile	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-SPHINXPROJ    = rocFFT
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff -Nru rocfft-5.5.0/docs/source/allapi.rst rocfft-5.7.1/docs/source/allapi.rst
--- rocfft-5.5.0/docs/source/allapi.rst	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/allapi.rst	1970-01-01 00:00:00.000000000 +0000
@@ -1,11 +0,0 @@
-.. toctree::
-   :maxdepth: 4 
-   :caption: Contents:
-
-=============
-API Reference
-=============
-
-.. doxygenindex::
-
- 
diff -Nru rocfft-5.5.0/docs/source/api.rst rocfft-5.7.1/docs/source/api.rst
--- rocfft-5.5.0/docs/source/api.rst	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/api.rst	1970-01-01 00:00:00.000000000 +0000
@@ -1,117 +0,0 @@
-.. toctree::
-   :maxdepth: 4 
-   :caption: Contents:
-
-=========
-API Usage
-=========
-
-This section describes usage of the rocFFT library API.
-
-Types
------
-
-There are a few data structures that are internal to the library. The pointer types to these
-structures are given below. The user would need to use these types to create handles and pass them
-between different library functions.
-
-.. doxygentypedef:: rocfft_plan
-
-.. doxygentypedef:: rocfft_plan_description
-
-.. doxygentypedef:: rocfft_execution_info
-
-Library Setup and Cleanup
--------------------------
-
-The following functions deal with initialization and cleanup of the library.
-
-.. doxygenfunction:: rocfft_setup
-
-.. doxygenfunction:: rocfft_cleanup
-
-Plan
-----
-
-The following functions are used to create and destroy plan objects.
-
-.. doxygenfunction:: rocfft_plan_create
-
-.. doxygenfunction:: rocfft_plan_destroy
-
-The following functions are used to query for information after a plan is created.
-
-.. doxygenfunction:: rocfft_plan_get_work_buffer_size
-
-.. doxygenfunction:: rocfft_plan_get_print
-
-Plan description
-----------------
-
-Most of the time, :cpp:func:`rocfft_plan_create` is able to fully
-specify a transform.  Advanced plan details such as strides and
-offsets require creation of a plan description object, which is
-configured and passed to the :cpp:func:`rocfft_plan_create` function.
-
-The plan description object can be safely destroyed after it is given
-to the :cpp:func:`rocfft_plan_create` function.
-
-.. doxygenfunction:: rocfft_plan_description_create
-
-.. doxygenfunction:: rocfft_plan_description_destroy
-
-.. doxygenfunction:: rocfft_plan_description_set_scale_factor
-
-.. doxygenfunction:: rocfft_plan_description_set_data_layout
-
-.. comment doxygenfunction:: rocfft_plan_description_set_devices
-
-Execution
----------
-
-After a plan has been created, it can be executed using the
-:cpp:func:`rocfft_execute` function,
-to compute a transform on specified data. Aspects of the execution can be controlled and any useful
-information returned to the user.
-
-.. doxygenfunction:: rocfft_execute
-
-Execution info
---------------
-
-:cpp:func:`rocfft_execute` takes an optional :cpp:type:`rocfft_execution_info` parameter. This parameter encapsulates
-information such as the work buffer and compute stream for the transform.
-
-.. doxygenfunction:: rocfft_execution_info_create
-
-.. doxygenfunction:: rocfft_execution_info_destroy
-
-.. doxygenfunction:: rocfft_execution_info_set_work_buffer
-
-.. comment doxygenfunction:: rocfft_execution_info_set_mode
-
-.. doxygenfunction:: rocfft_execution_info_set_stream
-
-.. comment doxygenfunction:: rocfft_execution_info_get_events
-
-
-Enumerations
-------------
-
-This section provides all the enumerations used.
-
-.. doxygenenum:: rocfft_status
-
-.. doxygenenum:: rocfft_transform_type
-
-.. doxygenenum:: rocfft_precision
-
-.. doxygenenum:: rocfft_result_placement
-
-.. doxygenenum:: rocfft_array_type
-
-.. comment doxygenenum:: rocfft_execution_mode
-
-
-
-
diff -Nru rocfft-5.5.0/docs/source/conf.py rocfft-5.7.1/docs/source/conf.py
--- rocfft-5.5.0/docs/source/conf.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/conf.py	1970-01-01 00:00:00.000000000 +0000
@@ -1,173 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# rocFFT documentation build configuration file, created by
-# sphinx-quickstart on Mon Jan  8 16:34:42 2018.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-import os
-import sys
-import subprocess
-
-read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-
-if read_the_docs_build:
-    subprocess.call('../run_doxygen.sh')
-
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = ['sphinx.ext.mathjax', 'breathe']
-breathe_projects = {"rocFFT": "../docBin/xml"}
-breathe_default_project = "rocFFT"
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'rocFFT'
-copyright = u'2016 - 2022, Advanced Micro Devices'
-author = u'Advanced Micro Devices'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = u'1.0.21'
-# The full version, including alpha/beta/rc tags.
-release = u'1.0.21'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-# html_theme = 'alabaster'
-
-if read_the_docs_build:
-    html_theme = 'default'
-else:
-    import sphinx_rtd_theme
-    html_theme = "sphinx_rtd_theme"
-    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#
-# html_theme_options = {}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ['_static']
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# This is required for the alabaster theme
-# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
-# html_sidebars = {
-#     '**': [
-#         'relations.html',  # needs 'show_related': True theme option to display
-#         'searchbox.html',
-#     ]
-# }
-
-# -- Options for HTMLHelp output ------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'rocFFTdoc'
-
-# -- Options for LaTeX output ---------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    # 'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    # 'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    #
-    # 'preamble': '',
-
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-    (master_doc, 'rocFFT.tex', u'rocFFT Documentation',
-     u'Advanced Micro Devices', 'manual'),
-]
-
-# -- Options for manual page output ---------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, 'rocfft', u'rocFFT Documentation', [author], 1)]
-
-# -- Options for Texinfo output -------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (master_doc, 'rocFFT', u'rocFFT Documentation', author, 'rocFFT',
-     'One line description of project.', 'Miscellaneous'),
-]
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_1dlen.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_1dlen.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_ex_n7.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_ex_n7.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_ex_n8.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_ex_n8.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_01.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_01.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_02.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_02.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_03.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_03.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_04.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_04.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_05.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_05.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_06.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_06.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_07.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_07.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_expl_08.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_expl_08.jpg differ
Binary files /tmp/tmpblbzjs4u/7vOfbKreng/rocfft-5.5.0/docs/source/images/realfft_fwdinv.jpg and /tmp/tmpblbzjs4u/2fYVTKatDG/rocfft-5.7.1/docs/source/images/realfft_fwdinv.jpg differ
diff -Nru rocfft-5.5.0/docs/source/index.rst rocfft-5.7.1/docs/source/index.rst
--- rocfft-5.5.0/docs/source/index.rst	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/index.rst	1970-01-01 00:00:00.000000000 +0000
@@ -1,21 +0,0 @@
-.. rocFFT documentation master file, created by
-   sphinx-quickstart on Mon Jan  8 09:51:41 2018.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Welcome to rocFFT's documentation!
-==================================
-
-.. toctree::
-   :maxdepth: 4 
-   :caption: Contents:
-
-   library 
-   api
-   allapi
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`search`
diff -Nru rocfft-5.5.0/docs/source/library.rst rocfft-5.7.1/docs/source/library.rst
--- rocfft-5.5.0/docs/source/library.rst	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/library.rst	1970-01-01 00:00:00.000000000 +0000
@@ -1,428 +0,0 @@
-
-.. toctree::
-   :maxdepth: 4 
-   :caption: Contents:
-
-======
-rocFFT
-======
-
-Introduction
-------------
-
-The rocFFT library is an implementation of the discrete Fast Fourier Transform (FFT) written in HIP for GPU devices.
-The code is open and hosted here: https://github.com/ROCmSoftwarePlatform/rocFFT
-
-The rocFFT library:
-
-* Provides a fast and accurate platform for calculating discrete FFTs.
-* Supports single and double precision floating point formats.
-* Supports 1D, 2D, and 3D transforms.
-* Supports computation of transforms in batches.
-* Supports real and complex FFTs.
-* Supports arbitrary lengths, with optimizations for combinations of
-  powers of 2, 3, and 5.
-
-FFT Computation
----------------
-
-The FFT is an implementation of the Discrete Fourier Transform (DFT) that makes use of symmetries in the DFT definition to
-reduce the mathematical complexity from :math:`O(N^2)` to :math:`O(N \log N)`.
-
-What is computed by the library? Here are the formulas:
-
-For a 1D complex DFT:
-
-:math:`{\tilde{x}}_j = \sum_{k=0}^{n-1}x_k\exp\left({\pm i}{{2\pi jk}\over{n}}\right)\hbox{ for } j=0,1,\ldots,n-1`
-
-where, :math:`x_k` are the complex data to be transformed, :math:`\tilde{x}_j` are the transformed data, and the sign :math:`\pm`
-determines the direction of the transform: :math:`-` for forward and :math:`+` for backward.
-
-For a 2D complex DFT:
-
-:math:`{\tilde{x}}_{jk} = \sum_{q=0}^{m-1}\sum_{r=0}^{n-1}x_{rq}\exp\left({\pm i} {{2\pi jr}\over{n}}\right)\exp\left({\pm i}{{2\pi kq}\over{m}}\right)`
-
-for :math:`j=0,1,\ldots,n-1\hbox{ and } k=0,1,\ldots,m-1`, where, :math:`x_{rq}` are the complex data to be transformed,
-:math:`\tilde{x}_{jk}` are the transformed data, and the sign :math:`\pm` determines the direction of the transform.
-
-For a 3D complex DFT:
-
-:math:`\tilde{x}_{jkl} = \sum_{s=0}^{p-1}\sum_{q=0}^{m-1}\sum_{r=0}^{n-1}x_{rqs}\exp\left({\pm i} {{2\pi jr}\over{n}}\right)\exp\left({\pm i}{{2\pi kq}\over{m}}\right)\exp\left({\pm i}{{2\pi ls}\over{p}}\right)`
-
-for :math:`j=0,1,\ldots,n-1\hbox{ and } k=0,1,\ldots,m-1\hbox{ and } l=0,1,\ldots,p-1`, where :math:`x_{rqs}` are the complex data to
-be transformed, :math:`\tilde{x}_{jkl}` are the transformed data, and the sign :math:`\pm` determines the direction of the transform.
-
-Library Setup and Cleanup
--------------------------
-
-At the beginning of the program, before any of the library APIs are called, the function :cpp:func:`rocfft_setup` has to be called. Similarly,
-the function :cpp:func:`rocfft_cleanup` has to be called at the end of the program. These APIs ensure resources are properly allocated and freed.
-
-Workflow
---------
-
-In order to compute an FFT with rocFFT, a plan has to be created first. A plan is a handle to an internal data structure that
-holds the details about the transform that the user wishes to compute. After the plan is created, it can be executed (a separate API call)
-with the specified data buffers. The execution step can be repeated any number of times with the same plan on different input/output buffers
-as needed. And when the plan is no longer needed, it gets destroyed.
-
-To do a transform,
-
-#. Initialize the library by calling :cpp:func:`rocfft_setup()`.
-#. Create a plan, for each distinct type of FFT needed:
-
-   * To create a plan, do either of the following
-
-     * If the plan specification is simple, call :cpp:func:`rocfft_plan_create` and specify the value of the fundamental parameters.
-     * If the plan has more details, first a plan description is created with :cpp:func:`rocfft_plan_description_create`, and additional APIs such
-       as :cpp:func:`rocfft_plan_description_set_data_layout` are called to specify plan details. And then, :cpp:func:`rocfft_plan_create` is called
-       with the description handle passed to it along with other details.
-
-   * Optionally, allocate a work buffer for the plan:
-
-     * Call :cpp:func:`rocfft_plan_get_work_buffer_size` to check the size of work buffer required by the plan.
-     * If a nonzero size is required:
-
-       * Create an execution info object with :cpp:func:`rocfft_execution_info_create`.
-       * Allocate a buffer using :cpp:func:`hipMalloc` and pass the allocated buffer to :cpp:func:`rocfft_execution_info_set_work_buffer`.
-
-#. Execute the plan:
-
-   * The execution API :cpp:func:`rocfft_execute` is used to do the actual computation on the data buffers specified.
-   * Extra execution information such as work buffers and compute streams are passed to :cpp:func:`rocfft_execute` in the :cpp:type:`rocfft_execution_info` object.
-   * :cpp:func:`rocfft_execute` can be called repeatedly as needed for different data, with the same plan.
-   * If the plan requires a work buffer but none was provided, :cpp:func:`rocfft_execute` will automatically allocate a work buffer and free it when execution is finished.
-
-#. If a work buffer was allocated:
-
-   * Call :cpp:func:`hipFree` to free the work buffer.
-   * Call :cpp:func:`rocfft_execution_info_destroy` to destroy the execution info object.
-
-#. Destroy the plan by calling :cpp:func:`rocfft_plan_destroy`.
-#. Terminate the library by calling :cpp:func:`rocfft_cleanup()`.
-
-
-Example
--------
-
-.. code-block:: c
-
-   #include <iostream>
-   #include <vector>
-   #include "hip/hip_runtime_api.h"
-   #include "hip/hip_vector_types.h"
-   #include "rocfft.h"
-   
-   int main()
-   {
-           // rocFFT gpu compute
-           // ========================================
-  
-           rocfft_setup();
-
-           size_t N = 16;
-           size_t Nbytes = N * sizeof(float2);
-   
-           // Create HIP device buffer
-           float2 *x;
-           hipMalloc(&x, Nbytes);
-   
-           // Initialize data
-           std::vector<float2> cx(N);
-           for (size_t i = 0; i < N; i++)
-           {
-                   cx[i].x = 1;
-                   cx[i].y = -1;
-           }
-   
-           //  Copy data to device
-           hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);
-   
-           // Create rocFFT plan
-           rocfft_plan plan = nullptr;
-           size_t length = N;
-           rocfft_plan_create(&plan, rocfft_placement_inplace,
-                rocfft_transform_type_complex_forward, rocfft_precision_single,
-                1, &length, 1, nullptr);
-
-	   // Check if the plan requires a work buffer
-	   size_t work_buf_size = 0;
-	   rocfft_plan_get_work_buffer_size(plan, &work_buf_size);
-	   void* work_buf = nullptr;
-	   rocfft_execution_info info = nullptr;
-	   if(work_buf_size)
-           {
-                   rocfft_execution_info_create(&info);
-		   hipMalloc(&work_buf, work_buf_size);
-		   rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size);
-           }
-   
-           // Execute plan
-           rocfft_execute(plan, (void**) &x, nullptr, info);
-   
-           // Wait for execution to finish
-           hipDeviceSynchronize();
-
-	   // Clean up work buffer
-	   if(work_buf_size)
-	   {
-	           hipFree(work_buf);
-		   rocfft_execution_info_destroy(info);
-	   }
-
-           // Destroy plan
-           rocfft_plan_destroy(plan);
-   
-           // Copy result back to host
-           std::vector<float2> y(N);
-           hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost);
-   
-           // Print results
-           for (size_t i = 0; i < N; i++)
-           {
-                   std::cout << y[i].x << ", " << y[i].y << std::endl;
-           }
-   
-           // Free device buffer
-           hipFree(x);
-   
-           rocfft_cleanup();
-
-           return 0;
-   }
-
-Plans
------
-
-A plan is the collection of (almost) all the parameters needed to specify an FFT computation. A rocFFT plan includes the
-following information:
-
-* Type of transform (complex or real)
-* Dimension of the transform (1D, 2D, or 3D)
-* Length or extent of data in each dimension
-* Number of datasets that are transformed (batch size)
-* Floating-point precision of the data
-* In-place or not in-place transform
-* Format (array type) of the input/output buffer
-* Layout of data in the input/output buffer 
-* Scaling factor to apply to the output of the transform
-
-The rocFFT plan does not include the following parameters:
-
-* The handles to the input and output data buffers.
-* The handle to a temporary work buffer (if needed).
-* Other information to control execution on the device.
-
-These parameters are specified when the plan is executed.
-
-Data
-----
-
-The input/output buffers that hold the data for the transform must be allocated, initialized and specified to the library by the
-user. For larger transforms, temporary work buffers may be needed. Because the library tries to minimize its own allocation of
-memory regions on the device, it expects the user to manage work buffers. The size of the buffer needed can be queried using
-:cpp:func:`rocfft_plan_get_work_buffer_size` and after their allocation can be passed to the library by
-:cpp:func:`rocfft_execution_info_set_work_buffer`. The samples in the source repository show how to use these.
-
-Transform and Array types 
--------------------------
-
-There are two main types of FFTs in the library:
-
-* Complex FFT - Transformation of complex data (forward or backward); the library supports the following two
-  array types to store complex numbers:
-
-  #. Planar format - where the real and imaginary components are kept in 2 separate arrays:
-
-     * Buffer1: ``RRRRR...`` 
-     * Buffer2: ``IIIII...``
-  #. Interleaved format - where the real and imaginary components are stored as contiguous pairs in the same array: 
-
-     * Buffer: ``RIRIRIRIRIRI...``
-  
-* Real FFT - Transformation of real data. For transforms involving real data, there are two possibilities:
-
-  * Real data being subject to forward FFT that results in complex data (Hermitian).
-  * Complex data (Hermitian) being subject to backward FFT that results in real data.
-
-.. note::
-
-   Real backward FFTs require that the input data be
-   Hermitian-symmetric, as would naturally happen in the output of a
-   real forward FFT.  rocFFT will produce undefined results if
-   this requirement is not met.
-
-The library provides the :cpp:enum:`rocfft_transform_type` and
-:cpp:enum:`rocfft_array_type` enums to specify transform and array
-types, respectively.
-
-Batches
--------
-
-The efficiency of the library is improved by utilizing transforms in batches. Sending as much data as possible in a single
-transform call leverages the parallel compute capabilities of devices (GPU devices in particular), and minimizes the penalty
-of control transfer. It is best to think of a device as a high-throughput, high-latency device. Using a networking analogy as
-an example, this approach is similar to having a massively high-bandwidth pipe with very high ping response times. If the client
-is ready to send data to the device for compute, it should be sent in as few API calls as possible, and this can be done by batching.
-rocFFT plans have a parameter `number_of_transforms` (this value is also referred to as batch size in various places in the document)
-in :cpp:func:`rocfft_plan_create` to describe the number of transforms being requested. All 1D, 2D, and 3D transforms can be batched.
-
-.. _resultplacement:
-
-Result placement
-----------------
-
-The API supports both in-place and not in-place transforms via the :cpp:enum:`rocfft_result_placement` enum.  With in-place transforms, only input buffers are provided to the
-execution API, and the resulting data is written to the same buffer, overwriting the input data.  With not in-place transforms, distinct
-output buffers are provided, and the results are written into the output buffer.
-
-Note that rocFFT may still modify the input buffer even if a transform is requested to be not in-place.  Real-complex transforms in particular are more efficient if they can modify the original input.
-
-Strides and Distances
----------------------
-
-Strides and distances enable users to specify custom layout of data using :cpp:func:`rocfft_plan_description_set_data_layout`.
-
-For 1D data, if :cpp:expr:`strides[0] == strideX == 1`, successive elements in the first dimension (dimension index 0) are stored
-contiguously in memory. If :cpp:expr:`strideX` is a value greater than 1, gaps in memory exist between each element of the vector.
-For multi-dimensional cases; if :cpp:expr:`strides[1] == strideY == LenX` for 2D data and :cpp:expr:`strides[2] == strideZ == LenX * LenY` for 3D data,
-no gaps exist in memory between each element, and all vectors are stored tightly packed in memory. Here, :cpp:expr:`LenX`, :cpp:expr:`LenY`, and :cpp:expr:`LenZ` denote the
-transform lengths :cpp:expr:`lengths[0]`, :cpp:expr:`lengths[1]`, and :cpp:expr:`lengths[2]`, respectively, which are used to set up the plan.
-
-Distance is the stride that exists between corresponding elements of successive FFT data instances (primitives) in a batch. Distance is measured in units of the memory type;
-complex data measures in complex units, and real data measures in real units. For tightly packed data, the distance between FFT primitives is the size of the FFT primitive,
-such that :cpp:expr:`dist == LenX` for 1D data, :cpp:expr:`dist == LenX * LenY` for 2D data, and :cpp:expr:`dist == LenX * LenY * LenZ` for 3D data. It is possible to set the distance of a plan to be less than the size
-of the FFT vector; typically 1 when doing column (strided) access on packed data. When computing a batch of 1D FFT vectors, if :cpp:expr:`distance == 1`, and :cpp:expr:`strideX == length(vector)`,
-it means data for each logical FFT is read along columns (in this case along the batch). You must verify that the distance and strides are valid, such that each logical
-FFT instance is not overlapping with any other; if not valid, undefined results may occur. A simple example would be to perform a 1D length 4096 on each row of an array of
-1024 rows x 4096 columns of values stored in a column-major array, such as a FORTRAN program might provide. (This would be equivalent to a C or C++ program that has an
-array of 4096 rows x 1024 columns stored in a row-major manner, on which you want to perform a 1D length 4096 transform on each column.) In this case, specify the
-strides as [1024] and distance as 1.
-
-Overwriting non-contiguous buffers
-==================================
-
-rocFFT guarantees that both the reading of FFT input and the writing of FFT output will respect the
-specified strides.  However, temporary results can potentially be written to these buffers
-contiguously, which may be unexpected if the strides would avoid certain memory locations completely
-for reading and writing.
-
-For example, a 1D FFT of length :math:`N` with input and output stride of 2 is transforming only
-even-indexed elements in the input and output buffers.  But if temporary data needs to be written to
-the buffers, odd-indexed elements may be overwritten.
-
-However, rocFFT is guaranteed to respect the size of buffers.  In the above example, the
-input/output buffers are :math:`2N` elements long, even if only :math:`N` even-indexed
-elements are being transformed.  No more than :math:`2N` elements of temporary data will be written
-to the buffers during the transform.
-
-These policies apply to both input and output buffers, because :ref:`not in-place transforms may overwrite input data<resultplacement>`.
-
-Transforms of real data
------------------------
-
-.. toctree::
-   :maxdepth: 2
-
-   real
-
-Result Scaling
---------------
-
-The output of a forward or backward FFT often needs to be multiplied
-by a scaling factor before the data can be passed to the next step of
-a computation.  While users of rocFFT can launch a separate GPU
-kernel to do this work, rocFFT provides a
-:cpp:func:`rocfft_plan_description_set_scale_factor` function to more
-efficiently combine this scaling multiplication with the FFT work.
-
-The scaling factor is set on the plan description prior to plan creation.
-
-Load and Store Callbacks
-------------------------
-
-rocFFT includes experimental functionality to call user-defined device functions
-when loading input from global memory at the start of a transform, or
-when storing output to global memory at the end of a transform.
-
-These user-defined callback functions may be optionally supplied
-to the library using
-:cpp:func:`rocfft_execution_info_set_load_callback` and
-:cpp:func:`rocfft_execution_info_set_store_callback`.
-
-Device functions supplied as callbacks must load and store element
-data types that are appropriate for the transform being performed.
-
-+-------------------------+--------------------+----------------------+
-|Transform type           | Load element type  | Store element type   |
-+=========================+====================+======================+
-|Complex-to-complex,      | `float2`           | `float2`             |
-|single-precision         |                    |                      |
-+-------------------------+--------------------+----------------------+
-|Complex-to-complex,      | `double2`          | `double2`            |
-|double-precision         |                    |                      |
-+-------------------------+--------------------+----------------------+
-|Real-to-complex,         | `float`            | `float2`             |
-|single-precision         |                    |                      |
-+-------------------------+--------------------+----------------------+
-|Real-to-complex,         | `double`           | `double2`            |
-|double-precision         |                    |                      |
-+-------------------------+--------------------+----------------------+
-|Complex-to-real,         | `float2`           | `float`              |
-|single-precision         |                    |                      |
-+-------------------------+--------------------+----------------------+
-|Complex-to-real,         | `double2`          | `double`             |
-|double-precision         |                    |                      |
-+-------------------------+--------------------+----------------------+
-
-The callback function signatures must match the specifications
-below.
-
-.. code-block:: c
-
-  T load_callback(T* buffer, size_t offset, void* callback_data, void* shared_memory);
-  void store_callback(T* buffer, size_t offset, T element, void* callback_data, void* shared_memory);
-
-The parameters for the functions are defined as:
-
-* `T`: The data type of each element being loaded or stored from the
-  input or output.
-* `buffer`: Pointer to the input (for load callbacks) or
-  output (for store callbacks) in device memory that was passed to
-  :cpp:func:`rocfft_execute`.
-* `offset`: The offset of the location being read from or written
-  to.  This counts in elements, from the `buffer` pointer.
-* `element`: For store callbacks only, the element to be stored.
-* `callback_data`: A pointer value accepted by
-  :cpp:func:`rocfft_execution_info_set_load_callback` and
-  :cpp:func:`rocfft_execution_info_set_store_callback` which is passed
-  through to the callback function.
-* `shared_memory`: A pointer to an amount of shared memory requested
-  when the callback is set.  Currently, shared memory is not supported
-  and this parameter is always null.
-
-Callback functions are called exactly once for each element being
-loaded or stored in a transform.  Note that multiple kernels may be
-launched to decompose a transform, which means that separate kernels
-may call the load and store callbacks for a transform if both are
-specified.
-
-Currently, callbacks functions are only supported for transforms that
-do not use planar format for input or output.
-
-Runtime compilation
--------------------
-
-rocFFT includes many kernels for common FFT problems.  Some plans may
-require additional kernels aside from what is built in to the
-library.  In these cases, rocFFT will compile optimized kernels for
-the plan when the plan is created.
-
-Compiled kernels are stored in memory by default and will be reused
-if they are required again for plans in the same process.
-
-If the ``ROCFFT_RTC_CACHE_PATH`` environment variable is set to a
-writable file location, rocFFT will write compiled kernels to this
-location.  rocFFT will read kernels from this location for plans in
-other processes that need runtime-compiled kernels.  rocFFT will
-create the specified file if it does not already exist.
diff -Nru rocfft-5.5.0/docs/source/real.rst rocfft-5.7.1/docs/source/real.rst
--- rocfft-5.5.0/docs/source/real.rst	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/real.rst	1970-01-01 00:00:00.000000000 +0000
@@ -1,151 +0,0 @@
-
-.. toctree::
-   :maxdepth: 2 
-   :caption: Contents:
-
-Real data
----------
-
-When real data is subject to DFT, the resulting complex output data follows a special property. About half of the
-output is redundant because they are complex conjugates of the other half. This is called the Hermitian redundancy. So, for space
-and performance considerations, it is only necessary to store the non-redundant part of the data. Most FFT libraries use this property to
-offer specific storage layouts for FFTs involving real data. rocFFT
-provides three enumeration values for :cpp:enum:`rocfft_array_type` to deal with real data FFTs:
-
-* REAL (:cpp:enumerator:`rocfft_array_type_real`)
-* HERMITIAN_INTERLEAVED (:cpp:enumerator:`rocfft_array_type_hermitian_interleaved`)
-* HERMITIAN_PLANAR (:cpp:enumerator:`rocfft_array_type_hermitian_planar`)
-
-The REAL (:cpp:enumerator:`rocfft_array_type_real`) enum specifies that the data is purely real. This can be used to feed real input or get back real output. The
-HERMITIAN_INTERLEAVED
-(:cpp:enumerator:`rocfft_array_type_hermitian_interleaved`) and HERMITIAN_PLANAR (:cpp:enumerator:`rocfft_array_type_hermitian_planar`) enums are similar to the corresponding full complex enums in the way
-they store real and imaginary components, but store only about half of the complex output. Client applications can do just a
-forward transform and analyze the output or they can process the output and do a backward transform to get back real data.
-This is illustrated in the following figure.
-
-.. figure:: ./images/realfft_fwdinv.jpg
-
-   **Forward and Backward Real FFTs**
-
-.. note::
-
-   Real backward FFTs require that the input data be
-   Hermitian-symmetric, as would naturally happen in the output of a
-   real forward FFT.  rocFFT will produce undefined results if
-   this requirement is not met.
-
-Let us consider a 1D real FFT of length :math:`N`. The full output looks as shown in following figure.
-
-.. figure:: ./images/realfft_1dlen.jpg
-
-   **1D Real FFT of Length N**
-
-Here, C* denotes the complex conjugate. Since the values at indices greater than :math:`N/2` can be deduced from the first half
-of the array, rocFFT stores data only up to the index :math:`N/2`. This means that the output contains only :math:`1 + N/2` complex
-elements, where the division :math:`N/2` is rounded down. Examples for even and odd lengths are given below.
-
-Example for :math:`N = 8` is shown in following figure.
-
-.. figure:: ./images/realfft_ex_n8.jpg
-
-   **Example for N = 8**
-
-Example for :math:`N = 7` is shown in following figure.
-
-.. figure:: ./images/realfft_ex_n7.jpg
-
-   **Example for N = 7**
-
-For length 8, only :math:`(1 + 8/2) = 5` of the output complex numbers are stored, with the index ranging from 0 through 4.
-Similarly for length 7, only :math:`(1 + 7/2) = 4` of the output complex numbers are stored, with the index ranging from 0 through 3.
-For 2D and 3D FFTs, the FFT length along the innermost dimension is used to compute the :math:`(1 + N/2)` value. This is because
-the FFT along the innermost dimension is computed first and is logically a real-to-hermitian transform. The FFTs along
-other dimensions are computed next, and they are simply 'complex-to-complex' transforms. For example, assuming :math:`Lengths[2]`
-is used to set up a 2D real FFT, let :math:`N1 = Lengths[1]`, and :math:`N0 = Lengths[0]`. The output FFT has :math:`N1*(1 + N0/2)` complex elements.
-Similarly, for a 3D FFT with :math:`Lengths[3]` and :math:`N2 = Lengths[2]`, :math:`N1 = Lengths[1]`, and :math:`N0 = Lengths[0]`, the output has :math:`N2*N1*(1 + N0/2)`
-complex elements.
-
-Supported array type combinations
----------------------------------
-
-Not In-place transforms:
-
-* Forward:  REAL to HERMITIAN_INTERLEAVED
-* Forward:  REAL to HERMITIAN_PLANAR
-* Backward: HERMITIAN_INTERLEAVED to REAL
-* Backward: HERMITIAN_PLANAR to REAL
-
-In-place transforms:
-
-* Forward:  REAL to HERMITIAN_INTERLEAVED
-* Backward: HERMITIAN_INTERLEAVED to REAL
-
-Setting strides
----------------
-
-The library currently requires the user to explicitly set input and output strides for real transforms for non simple cases.
-See the following examples to understand what values to use for input and output strides under different scenarios. These examples show
-typical usages, but the user can allocate the buffers and choose data layout according to their need.
-
-Examples
---------
-
-The following figures and examples explain in detail the real FFT features of this library.
-
-Here is a schematic that illustrates the forward 1D FFT (real to hermitian).
-
-.. figure:: ./images/realfft_expl_01.jpg
-
-   **1D FFT - Real to Hermitian**
-
-Below is a schematic that shows an example of not in-place transform with even :math:`N` and how strides and distances are set.
-
-.. figure:: ./images/realfft_expl_02.jpg
-
-   **1D FFT - Real to Hermitian, Example 1**
-
-Below is a schematic that shows an example of in-place transform with even :math:`N` and how strides and distances are set.
-Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
-values compared to input strides/distance.
-
-.. figure:: ./images/realfft_expl_03.jpg
-
-   **1D FFT - Real to Hermitian, Example 2**
-
-Below is a schematic that shows an example of in-place transform with odd :math:`N` and how strides and distances are set.
-Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
-values compared to input strides/distance.
-
-.. figure:: ./images/realfft_expl_04.jpg
-
-   **1D FFT - Real to Hermitian, Example 3**
-
-And here is a schematic that illustrates the backward 1D FFT (hermitian to real).
-
-.. figure:: ./images/realfft_expl_05.jpg
-
-   **1D FFT - Hermitian to Real**
-
-Below is a schematic that shows an example of in-place transform with even :math:`N` and how strides and distances are set.
-Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
-values compared to input strides/distance.
-
-.. figure:: ./images/realfft_expl_06.jpg
-
-   **1D FFT - Hermitian to Real, Example**
-
-And here is a schematic that illustrates the in-place forward 2D FFT (real to hermitian) .
-
-.. figure:: ./images/realfft_expl_07.jpg
-
-   **2D FFT - Real to Hermitian In Place**
-
-Below is a schematic that shows an example of in-place 2D transform and how strides and distances are set.
-Notice that even though we are dealing with only 1 buffer (in-place), the output strides/distance can take different
-values compared to input strides/distance.
-
-.. figure:: ./images/realfft_expl_08.jpg
-
-   **2D FFT - Real to Hermitian, Example**
-
-
diff -Nru rocfft-5.5.0/docs/source/requirements.txt rocfft-5.7.1/docs/source/requirements.txt
--- rocfft-5.5.0/docs/source/requirements.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/docs/source/requirements.txt	1970-01-01 00:00:00.000000000 +0000
@@ -1,3 +0,0 @@
-
-breathe
-
diff -Nru rocfft-5.5.0/install.sh rocfft-5.7.1/install.sh
--- rocfft-5.5.0/install.sh	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/install.sh	2023-08-09 16:19:51.000000000 +0000
@@ -36,6 +36,7 @@
     echo "    [-d|--dependencies] install build dependencies"
     echo "    [-c|--clients] build library clients too (combines with -i & -d)"
     echo "    [-g|--debug] -DCMAKE_BUILD_TYPE=Debug (default is =Release)"
+    echo "    [-t|--tuner] -DROCFFT_BUILD_OFFLINE_TUNER=ON (default is =Off)"
     echo "    [-r]--relocatable] create a package to support relocatable ROCm"
     #echo "    [--cuda] build library for cuda backend"
     echo "    [--hip-clang] build library for amdgpu backend using hip-clang"
@@ -45,6 +46,8 @@
     echo "    [--gen-group-num] Specify the group numbers of generated kernel files"
     echo "    [--manual-small] Additional small sizes list to generate, ='A[,B,C,..]', default is empty "
     echo "    [--manual-large] Additional large sizes list to generate, ='A[,B,C,..]', default is empty "
+    echo "    [--solmap-folder] Specify the folder (abs-path) of solution-map that would be built into library, "
+    echo "                      default is [repo-folder]/solution_map/"
     echo "    [--address-sanitizer] build with address sanitizer enabled"
     echo "    [--rm-legacy-include-dir] Remove legacy include dir Packaging added for file/folder reorg backward compatibility."
 }
@@ -268,8 +271,8 @@
 install_prefix=rocfft-install
 build_clients=false
 build_release=true
+build_tuner=false
 build_relocatable=false
-build_hip_clang=true
 pattern_arg=false
 precision_arg=false
 group_num=false
@@ -277,6 +280,7 @@
 manual_large_arg=false
 build_address_sanitizer=false
 build_freorg_bkwdcomp=true
+solmap_data_folder=false
 
 # #################################################
 # Parameter parsing
@@ -285,7 +289,7 @@
 # check if we have a modern version of getopt that can handle whitespace and long parameters
 getopt -T
 if [[ $? -eq 4 ]]; then
-    GETOPT_PARSE=$(getopt --name "${0}" -o 'hidcgr' --long 'help,install,clients,dependencies,debug,hip-clang,prefix:,relocatable,gen-pattern:,gen-precision:,gen-group-num:,manual-small:,manual-large:,address-sanitizer, rm-legacy-include-dir' --options hicgdr -- "$@")
+    GETOPT_PARSE=$(getopt --name "${0}" -o 'hidcgrt' --long 'help,install,clients,dependencies,debug,tuner,hip-clang,prefix:,relocatable,gen-pattern:,gen-precision:,gen-group-num:,manual-small:,manual-large:,solmap-folder:,address-sanitizer, rm-legacy-include-dir' --options hicgdrt -- "$@")
 else
     echo "Need a new version of getopt"
     exit 1
@@ -319,8 +323,8 @@
         -g|--debug)
             build_release=false
             shift ;;
-        --hip-clang)
-            build_hip_clang=true
+        -t|--tuner)
+            build_tuner=true
             shift ;;
         --address-sanitizer)
             build_address_sanitizer=true
@@ -352,6 +356,10 @@
             # echo $2
             manual_large_arg=${2}
             shift 2 ;;
+        --solmap-folder)
+            # echo $2
+            solmap_data_folder=${2}
+            shift 2 ;;
         --) shift ; break ;;
         *)  echo "Unexpected command line parameter received; aborting";
             exit 1
@@ -457,6 +465,13 @@
     cmake_common_options="${cmake_common_options} -DGENERATOR_MANUAL_LARGE_SIZE=${manual_large_arg}"
 fi
 
+if [[ "${build_tuner}" != false ]]; then
+    cmake_common_options="${cmake_common_options} -DROCFFT_BUILD_OFFLINE_TUNER=ON"
+fi
+
+if [[ "${solmap_data_folder}" != false ]]; then
+    cmake_common_options="${cmake_common_options} -DSOLUTION_MAP_DATABASE_FOLDER=${solmap_data_folder}"
+fi
 
 # build type
 if [[ "${build_release}" == true ]]; then
@@ -473,13 +488,6 @@
 fi
 
 compiler="hipcc"
-if [[ "${build_hip_clang}" == true ]]; then
-    compiler="hipcc"
-fi
-
-if [[ "${build_hip_clang}" == true ]]; then
-    cmake_common_options="${cmake_common_options} -DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang"
-fi
 
 # Build library with AMD toolchain because of existense of device kernels
 if [[ "${build_clients}" == false ]]; then
diff -Nru rocfft-5.5.0/library/CMakeLists.txt rocfft-5.7.1/library/CMakeLists.txt
--- rocfft-5.5.0/library/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -35,7 +35,7 @@
 if (BUILD_CPUREF)
   set(CMAKE_CXX_FLAGS_DEBUG "-DREF_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}")
 endif()
-  
+
 # Print out compiler flags for viewing/debug
 if( BUILD_VERBOSE )
   message( STATUS "rocfft_VERSION: ${rocfft_VERSION}" )
@@ -67,7 +67,7 @@
 configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/include/rocfft.h" "${PROJECT_BINARY_DIR}/include/rocfft/rocfft.h" COPYONLY )
 
 set( rocfft_headers_public
-  include/rocfft.h
+  ${PROJECT_BINARY_DIR}/include/rocfft/rocfft.h
   ${PROJECT_BINARY_DIR}/include/rocfft/rocfft-version.h
 )
 
diff -Nru rocfft-5.5.0/library/include/rocfft-version.h.in rocfft-5.7.1/library/include/rocfft-version.h.in
--- rocfft-5.5.0/library/include/rocfft-version.h.in	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/include/rocfft-version.h.in	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,8 +22,8 @@
 
 /* the configured version and settings
  */
-#ifndef ROCFFT_VERSION_H_
-#define ROCFFT_VERSION_H_
+#ifndef ROCFFT_VERSION_H
+#define ROCFFT_VERSION_H
 
 // clang-format off
 #define rocfft_version_major      @rocfft_VERSION_MAJOR@
@@ -32,4 +32,4 @@
 #define rocfft_version_tweak      @rocfft_VERSION_TWEAK@
 // clang-format on
 
-#endif
+#endif /* ROCFFT_VERSION_H */
diff -Nru rocfft-5.5.0/library/include/rocfft.h rocfft-5.7.1/library/include/rocfft.h
--- rocfft-5.5.0/library/include/rocfft.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/include/rocfft.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,8 +24,8 @@
  *  rocfft.h defines all the public interfaces and types
  *  */
 
-#ifndef __ROCFFT_H__
-#define __ROCFFT_H__
+#ifndef ROCFFT_H
+#define ROCFFT_H
 
 #ifdef rocfft_EXPORTS
 #include "rocfft-export.h"
@@ -84,6 +84,7 @@
 {
     rocfft_precision_single,
     rocfft_precision_double,
+    rocfft_precision_half,
 } rocfft_precision;
 
 /*! @brief Result placement
@@ -213,7 +214,7 @@
  *  @details rocFFT multiplies each element of the result by the given factor at the end of the transform.
  *
  *  The supplied factor must be a finite number.  That is, it must neither be infinity nor NaN.
- * 
+ *
  *  @param[in] description description handle
  *  @param[in] scale scaling factor
  *  */
@@ -222,10 +223,10 @@
 
 /*!
  *  @brief Set advanced data layout parameters on a plan description
- * 
+ *
  *  @details This API specifies advanced layout of input/output
  *  buffers for a plan description.
- * 
+ *
  *  The following parameters are supported for inputs and outputs:
  *
  *  * Array type (real, hermitian, or complex data, in either
@@ -508,8 +509,18 @@
 ROCFFT_EXPORT rocfft_status rocfft_cache_deserialize(const void* buffer, size_t buffer_len_bytes);
 #endif
 
+#ifdef ROCFFT_BUILD_OFFLINE_TUNER
+/*! @brief Get a handle to the offline tuner
+ *
+ *  @details For developers only: the tuner type is not part of the public API
+ *  yet, and this function is used only by our standalone executable.  Must be
+ *  called after rocfft_setup.
+ *  @param[out] offline_tuner receives the tuner pointer; dereference it to use the tuner */
+ROCFFT_EXPORT rocfft_status rocfft_get_offline_tuner_handle(void** offline_tuner);
+#endif
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
 
-#endif /* __ROCFFT_H__ */
+#endif /* ROCFFT_H */
diff -Nru rocfft-5.5.0/library/src/CMakeLists.txt rocfft-5.7.1/library/src/CMakeLists.txt
--- rocfft-5.5.0/library/src/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 # #############################################################################
-# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -50,7 +50,7 @@
   endif()
 endif()
 
-set( package_targets rocfft rocfft_rtc_helper )
+set( package_targets rocfft )
 target_include_directories( rocfft_rtc_helper
   PRIVATE
   $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/library/src/include>
@@ -80,6 +80,7 @@
 
 # files that need to be embedded into the library, to be able to generate code
 set( kgen_embed_files
+     ${CMAKE_SOURCE_DIR}/shared/rocfft_complex.h
      ${CMAKE_SOURCE_DIR}/library/src/device/kernels/common.h
      ${CMAKE_SOURCE_DIR}/library/src/device/kernels/memory_gfx.h
      ${CMAKE_SOURCE_DIR}/library/src/device/kernels/callback.h
@@ -99,7 +100,6 @@
      ${CMAKE_SOURCE_DIR}/library/src/device/generator/rtc_radix_functions/radix_13.h
      ${CMAKE_SOURCE_DIR}/library/src/device/generator/rtc_radix_functions/radix_16.h
      ${CMAKE_SOURCE_DIR}/library/src/device/generator/rtc_radix_functions/radix_17.h
-     ${CMAKE_SOURCE_DIR}/library/src/device/generator/rtc_workarounds.h
    )
 
 # files that contribute to the logic of how code gets generated -
@@ -107,6 +107,9 @@
 # to serve as a "version" for the code generator.
 set( kgen_logic_files
 
+     # Complex number datatype
+     ${CMAKE_SOURCE_DIR}/shared/rocfft_complex.h
+
      # python code that does the embedding
      ${CMAKE_SOURCE_DIR}/library/src/device/kernel-generator-embed-cpp.py
 
@@ -128,6 +131,7 @@
      ${CMAKE_SOURCE_DIR}/library/src/device/generator/stockham_gen_cr.h
      ${CMAKE_SOURCE_DIR}/library/src/device/generator/stockham_gen_rc.h
      ${CMAKE_SOURCE_DIR}/library/src/device/generator/stockham_gen_rr.h
+     ${CMAKE_SOURCE_DIR}/library/src/device/generator/bluestein_generator.h     
      ${CMAKE_SOURCE_DIR}/library/src/rtc_compile.cpp
      ${CMAKE_SOURCE_DIR}/library/src/include/rtc_stockham_gen.h
      ${CMAKE_SOURCE_DIR}/library/src/rtc_stockham_gen.cpp
@@ -143,6 +147,14 @@
      # bluestein generator code
      ${CMAKE_SOURCE_DIR}/library/src/include/rtc_bluestein_gen.h
      ${CMAKE_SOURCE_DIR}/library/src/rtc_bluestein_gen.cpp
+
+     # twiddle generator code
+     ${CMAKE_SOURCE_DIR}/library/src/include/rtc_twiddle_gen.h
+     ${CMAKE_SOURCE_DIR}/library/src/rtc_twiddle_gen.cpp
+
+     # chirp generator code
+     ${CMAKE_SOURCE_DIR}/library/src/include/rtc_chirp_gen.h
+     ${CMAKE_SOURCE_DIR}/library/src/rtc_chirp_gen.cpp
 )
 
 add_custom_command(
@@ -152,6 +164,25 @@
   DEPENDS ${kgen_embed_command} ${kgen_embed_files} ${kgen_logic_files}
 )
 
+# location of the generated solutions map cpp
+set( gen_solutions ${CMAKE_BINARY_DIR}/library/src/solutions.cpp )
+set( solship_py ${CMAKE_SOURCE_DIR}/library/src/device/solution-shipping.py )
+
+# default folder of solution maps that will be built in library,
+# user can specify their own arch and folder
+set( sol_gpu_arch ${AMDGPU_TARGETS} )
+if( NOT SOLUTION_MAP_DATABASE_FOLDER )
+  set( SOLUTION_MAP_DATABASE_FOLDER ${CMAKE_SOURCE_DIR}/solution_map )
+endif()
+
+add_custom_command(
+  OUTPUT ${gen_solutions}
+  COMMAND ${PYTHON3_EXE} ${solship_py}
+  --gpu-arch="${sol_gpu_arch}" --data-folder=${SOLUTION_MAP_DATABASE_FOLDER}
+  DEPENDS ${solship_py}  # regenerate solutions.cpp whenever the generator script changes
+  COMMENT "Put solution map from external text file into library"
+)
+
 # The following is a list of implementation files defining the library
 set( rocfft_source
   auxiliary.cpp
@@ -159,6 +190,7 @@
   transform.cpp
   repo.cpp
   powX.cpp
+  chirp.cpp
   twiddles.cpp
   kargs.cpp
   tree_node.cpp
@@ -170,7 +202,10 @@
   fuse_shim.cpp
   assignment_policy.cpp
   node_factory.cpp
+  enum_printer.cpp
   rtc_exports.cpp
+  tuning_kernel_tuner.cpp
+  tuning_plan_tuner.cpp
   )
 
 # SQLite 3.36.0 enabled the backup API by default, which we need
@@ -199,6 +234,7 @@
   )
   FetchContent_MakeAvailable(sqlite_local)
   add_library( sqlite3 OBJECT ${sqlite_local_SOURCE_DIR}/sqlite3.c )
+  target_include_directories( sqlite3 PUBLIC ${sqlite_local_SOURCE_DIR} )
   set_target_properties( sqlite3 PROPERTIES
     C_VISIBILITY_PRESET "hidden"
     VISIBILITY_INLINES_HIDDEN ON
@@ -231,17 +267,23 @@
 add_library( rocfft-rtc-subprocess OBJECT
   rtc_subprocess.cpp
 )
+target_compile_definitions( rocfft-rtc-subprocess PRIVATE
+  -DROCFFT_VERSION=${VERSION_STRING}
+)
 # generation of kernel source
 add_library( rocfft-rtc-gen OBJECT
   rtc_bluestein_gen.cpp
   rtc_realcomplex_gen.cpp
   rtc_stockham_gen.cpp
   rtc_transpose_gen.cpp
+  rtc_twiddle_gen.cpp
+  rtc_chirp_gen.cpp
 )
 # caching of generation/compilation
 add_library( rocfft-rtc-cache OBJECT
   rtc_cache.cpp
 )
+target_link_libraries( rocfft-rtc-cache PUBLIC ${ROCFFT_SQLITE_LIB} )
 # generating kernels from TreeNodes and launching them
 add_library( rocfft-rtc-launch OBJECT
   rtc_kernel.cpp
@@ -249,6 +291,20 @@
   rtc_realcomplex_kernel.cpp
   rtc_stockham_kernel.cpp
   rtc_transpose_kernel.cpp
+  rtc_twiddle_kernel.cpp
+  rtc_chirp_kernel.cpp
+)
+target_link_libraries( rocfft-rtc-launch PRIVATE rocfft-rtc-cache )
+
+# compilation of solution map object and solutions
+add_library( rocfft-solution-map OBJECT
+  solution_map.cpp
+  solutions.cpp
+)
+
+# compilation of tuning helper object
+add_library( rocfft-tuning-helper OBJECT
+  tuning_helper.cpp
 )
 
 foreach( target
@@ -258,12 +314,13 @@
   rocfft-rtc-gen
   rocfft-rtc-cache
   rocfft-rtc-launch
+  rocfft-solution-map
+  rocfft-tuning-helper
   )
   target_include_directories( ${target}
     PRIVATE
     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/library/src/include>
     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/library/include>
-    ${sqlite_local_SOURCE_DIR}
   )
   set_target_properties( ${target} PROPERTIES
     CXX_VISIBILITY_PRESET "hidden"
@@ -294,6 +351,23 @@
   rocfft_stub.cpp
 )
 
+if( ROCFFT_BUILD_OFFLINE_TUNER )  # both tools below exist only when the offline tuner is requested
+  add_executable( rocfft_offline_tuner
+    ../../shared/array_validator.cpp
+    enum_printer.cpp
+    rocfft_offline_tuner.cpp
+    rocfft_stub.cpp
+  )
+  target_compile_options( rocfft_offline_tuner PRIVATE -DROCFFT_BUILD_OFFLINE_TUNER )  # rocfft.h declares rocfft_get_offline_tuner_handle under this macro
+
+  add_executable( rocfft_solmap_convert
+    enum_printer.cpp
+    rocfft_solmap_convert.cpp
+    rocfft_stub.cpp
+  )
+  target_compile_options( rocfft_solmap_convert PRIVATE -DROCFFT_BUILD_OFFLINE_TUNER )  # same macro as the tuner executable
+endif()
+
 prepend_path( ".." rocfft_headers_public relative_rocfft_headers_public )
 
 add_library( rocfft
@@ -305,6 +379,9 @@
 if( ROCFFT_RUNTIME_COMPILE )
   target_compile_options( rocfft PRIVATE -DROCFFT_RUNTIME_COMPILE )
 endif()
+if( ROCFFT_BUILD_OFFLINE_TUNER )
+  target_compile_options( rocfft PRIVATE -DROCFFT_BUILD_OFFLINE_TUNER )
+endif()
 
 if( NOT BUILD_SHARED_LIBS )
   target_link_libraries( rocfft INTERFACE ${ROCFFT_HOST_LINK_LIBS} )
@@ -325,7 +402,13 @@
 if(TARGET rocfft-device-3)
   target_link_libraries( rocfft PRIVATE rocfft-device-3 )
 endif()
-foreach( target rocfft rocfft_aot_helper rocfft_config_search )
+
+foreach( target rocfft rocfft_offline_tuner rocfft_solmap_convert rocfft_aot_helper rocfft_config_search )
+
+  if(( NOT ROCFFT_BUILD_OFFLINE_TUNER ) AND ((${target} STREQUAL "rocfft_offline_tuner") OR (${target} STREQUAL "rocfft_solmap_convert")))
+    continue()
+  endif()
+
   # RTC uses dladdr to find the RTC helper program
   if( NOT WIN32 )
     target_link_libraries( ${target} PUBLIC -ldl pthread )
@@ -365,18 +448,40 @@
   generator
   rocfft-function-pool
   rocfft-rtc-launch
+  rocfft-solution-map
+  rocfft-tuning-helper
   )
 target_link_libraries( rocfft_config_search PRIVATE
   ${ROCFFT_HOST_LINK_LIBS}
   generator
   rocfft-rtc-launch
-  rocfft-function-pool-standalone
+  rocfft-function-pool
   )
 target_link_libraries( rocfft_aot_helper PRIVATE
   generator
-  rocfft-function-pool-standalone
+  rocfft-function-pool
   )
 
+# link the offline tuner executables: rocfft_offline_tuner and rocfft_solmap_convert
+if( ROCFFT_BUILD_OFFLINE_TUNER )
+  target_link_libraries( rocfft_offline_tuner PRIVATE
+    rocfft
+    generator
+    rocfft-function-pool
+    rocfft-rtc-launch
+    rocfft-solution-map
+    rocfft-tuning-helper
+    )
+  target_link_libraries( rocfft_solmap_convert PRIVATE
+    rocfft
+    generator
+    rocfft-function-pool
+    rocfft-rtc-launch
+    rocfft-solution-map
+    rocfft-tuning-helper
+    )
+endif()
+
 # compile kernels into the cache file we ship
 #
 # cache file should go next to the shared object - on Windows this
@@ -440,19 +545,37 @@
   ${CMAKE_BINARY_DIR}/include
   )
 
-# kernel cache needs to go next to the library - Linux puts shared
+# kernel cache is architecture-dependent data for the library, placed
+# in a rocFFT subdirectory next to the library.  Linux puts shared
 # objects in lib, Windows puts DLLs in bin
 if(WIN32)
-  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${CMAKE_INSTALL_BINDIR})
+  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}/rocfft)
 else()
-  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${ROCM_INSTALL_LIBDIR})
+  set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${ROCM_INSTALL_LIBDIR}/rocfft)
 endif()
-rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
-  DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
-  COMPONENT runtime
-)
 
-#         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
+if( NOT ENABLE_ASAN_PACKAGING )
+  rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
+    DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
+    COMPONENT runtime
+  )
+endif()
+
+# rtc helper is an internal library executable on Linux, placed in a
+# rocFFT subdirectory of the library directory.  On Windows it goes
+# into bin next to the library, to simplify finding DLLs.
+if(WIN32)
+  set(ROCFFT_RTC_HELPER_INSTALL_DIR ${CMAKE_INSTALL_BINDIR})
+else()
+  set(ROCFFT_RTC_HELPER_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}/rocfft/${VERSION_STRING} )
+endif()
+
+if( NOT ENABLE_ASAN_PACKAGING )
+  rocm_install(PROGRAMS $<TARGET_FILE:rocfft_rtc_helper>
+    DESTINATION "${ROCFFT_RTC_HELPER_INSTALL_DIR}"
+    COMPONENT runtime
+  )
+endif()
 
 rocm_export_targets(
   TARGETS roc::rocfft
diff -Nru rocfft-5.5.0/library/src/assignment_policy.cpp rocfft-5.7.1/library/src/assignment_policy.cpp
--- rocfft-5.5.0/library/src/assignment_policy.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/assignment_policy.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -22,7 +22,9 @@
 #include "../../shared/arithmetic.h"
 #include "../../shared/ptrdiff.h"
 #include "./device/kernels/array_format.h"
+#include "enum_printer.h"
 #include "logging.h"
+#include "node_factory.h"
 #include <numeric>
 #include <optional>
 #include <set>
@@ -140,7 +142,7 @@
     {
         --nextExecSeqID;
         auto nextNode = execSeq[nextExecSeqID];
-        if(!nextNode->IsBluesteinChirpSetup())
+        if(!nextNode->IsBluesteinChirpSetup() || execPlan.IsChirpPlan)
         {
             parent->Backtracking(execPlan, nextExecSeqID);
             return;
@@ -201,6 +203,16 @@
     // - they are setting up the chirp buffer
     // - they are the internal steps of multi-kernel bluestein, and
     //   FFT steps may be further decomposed into separate kernels.
+    // - sbrc nodes in multi-kernel fused bluestein that do:
+    //   (1) chirp + padding + forward fft, or
+    //   (2) chirp / input Hadamard product + padding + forward fft
+
+    // First check multi-kernel fused Bluestein implementation
+    if((node.fuseBlue == BFT_FWD_CHIRP || node.fuseBlue == BFT_FWD_CHIRP_MUL)
+       && node.scheme == CS_KERNEL_STOCKHAM_BLOCK_RC)
+    {
+        return true;
+    }
 
     // go up the tree, looking for a bluestein parent node
     for(auto n = &node; n != nullptr; n = n->parent)
@@ -248,9 +260,10 @@
     // the input side of an in-place R2C transform (which the plan
     // would normally call OB_USER_OUT).
     auto dataFits = [&execPlan](const TreeNode& node, OperatingBuffer buffer) {
-        auto nodeLen = node.GetOutputLength();
-        auto bufLen  = buffer == OB_USER_OUT ? execPlan.rootPlan->GetOutputLength()
-                                             : execPlan.rootPlan->length;
+        auto outLengthBlueN = {node.lengthBlueN};
+        auto nodeLen        = (node.fuseBlue == BFT_NONE) ? node.GetOutputLength() : outLengthBlueN;
+        auto bufLen         = buffer == OB_USER_OUT ? execPlan.rootPlan->GetOutputLength()
+                                                    : execPlan.rootPlan->length;
 
         // if node's output is complex and buffer's format is real,
         // adjust output length to be 2x to make the units of
@@ -321,6 +334,18 @@
     {
         test_result = ValidOutBufferBluestein(node);
     }
+    // second node in multi-kernel fused Bluestein must write only to
+    // two specific buffers
+    else if((buffer == OB_USER_OUT || buffer == OB_TEMP_CMPLX_FOR_REAL)
+            && node.fuseBlue == BFT_FWD_CHIRP_MUL)
+    {
+        test_result = false;
+    }
+    // third node in multi-kernel fused Bluestein must not write to OB_TEMP_CMPLX_FOR_REAL
+    else if(buffer == OB_TEMP_CMPLX_FOR_REAL && node.fuseBlue == BFT_INV_CHIRP_MUL)
+    {
+        test_result = false;
+    }
     // if output goes to a temp buffer, that will be dynamically sized
     // to be big enough so it's always ok but if output is in/out, we
     // have to fit into whatever the user gave us
@@ -358,12 +383,14 @@
 bool AssignmentPolicy::CheckAssignmentValid(ExecPlan& execPlan)
 {
     auto getBufSize = [](TreeNode* node, bool input) {
+        auto lengthBlueN = {node->lengthBlueN};
+        auto outputLen   = node->fuseBlue == BFT_NONE ? node->GetOutputLength() : lengthBlueN;
+
         if(input)
             return compute_ptrdiff(node->length, node->inStride, node->batch, node->iDist);
         else
         {
-            return compute_ptrdiff(node->UseOutputLengthForPadding() ? node->GetOutputLength()
-                                                                     : node->length,
+            return compute_ptrdiff(node->UseOutputLengthForPadding() ? outputLen : node->length,
                                    node->outStride,
                                    node->batch,
                                    node->oDist);
@@ -399,7 +426,6 @@
             {
                 if(outfact * curr->inStride[i] != infact * curr->outStride[i])
                 {
-                    // std::cout << "error in stride assignments, re-assign" << std::endl;
                     return false;
                 }
             }
@@ -433,7 +459,7 @@
     if(winnerCandidates.empty())
         return;
 
-    // std::cout << "total candidates: " << winnerCandidates.size() << std::endl;
+    //std::cout << "total candidates: " << winnerCandidates.size() << std::endl;
 
     // sort the candidate, front is the best
     std::sort(
@@ -496,7 +522,77 @@
     return;
 }
 
-bool AssignmentPolicy::AssignBuffers(ExecPlan& execPlan)
+void AssignmentPolicy::FindBluesteinFusedNodes(ExecPlan&               execPlan,
+                                               std::vector<TreeNode*>& fusedNodes)
+{
+    std::vector<TreeNode*> bluesteinNodes;
+    execPlan.rootPlan->RecursiveFindChildNodes(CS_BLUESTEIN, bluesteinNodes);
+    execPlan.rootPlan->AssignParams();
+    // append only the multi-kernel fused Bluestein nodes to the caller's list
+    for(TreeNode* candidate : bluesteinNodes)
+        if(candidate->typeBlue == BT_MULTI_KERNEL_FUSED)
+            fusedNodes.push_back(candidate);
+}
+
+void AssignmentPolicy::AssignChirpBuffers(ExecPlan& execPlan)
+{
+    execPlan.IsChirpPlan = false; // the plan passed in is not itself a chirp sub-plan
+    // collect the CS_BLUESTEIN nodes whose typeBlue is BT_MULTI_KERNEL_FUSED
+    std::vector<TreeNode*> blueMultiFusedNodes;
+    FindBluesteinFusedNodes(execPlan, blueMultiFusedNodes);
+
+    for(auto& node : blueMultiFusedNodes)
+    {
+        auto& chirpFwdNode            = node->childNodes[0]; // NOTE(review): assumes at least two children -- confirm
+        chirpFwdNode->obIn            = OB_TEMP_BLUESTEIN;
+        chirpFwdNode->inArrayType     = rocfft_array_type_complex_interleaved;
+        chirpFwdNode->obOut           = OB_TEMP_BLUESTEIN;
+        chirpFwdNode->outArrayType    = rocfft_array_type_complex_interleaved;
+        chirpFwdNode->placement       = rocfft_placement_inplace;
+        chirpFwdNode->allowInplace    = true;
+        chirpFwdNode->allowOutofplace = false;
+        // second child: marked not-in-place, with both placements allowed
+        auto& chirpFwdMulNode            = node->childNodes[1];
+        chirpFwdMulNode->placement       = rocfft_placement_notinplace;
+        chirpFwdMulNode->allowInplace    = true;
+        chirpFwdMulNode->allowOutofplace = true;
+        // metadata for a standalone tree with the first child's length and dimension
+        NodeMetaData chirpFwdNodePlan(chirpFwdNode.get());
+        chirpFwdNodePlan.length    = chirpFwdNode->length;
+        chirpFwdNodePlan.dimension = chirpFwdNode->dimension;
+        // temporary plan, flagged as a chirp plan, rooted at a parentless node built from that metadata
+        ExecPlan execPlanFwdChirp;
+        execPlanFwdChirp.IsChirpPlan = true;
+        execPlanFwdChirp.rootPlan    = NodeFactory::CreateExplicitNode(chirpFwdNodePlan, nullptr);
+
+        execPlanFwdChirp.rootPlan->RecursiveBuildTree();
+        execPlanFwdChirp.rootPlan->RecursiveCopyNodeData(*chirpFwdNode);
+        execPlanFwdChirp.rootPlan->CollectLeaves(execPlanFwdChirp.execSeq,
+                                                 execPlanFwdChirp.fuseShims);
+
+        execPlanFwdChirp.assignOptStrategy = rocfft_optimize_balance;
+        // run buffer assignment on the temporary plan by itself
+        AssignBuffers_internal(execPlanFwdChirp);
+        // copy the temporary plan's node data back into the first child
+        chirpFwdNode->RecursiveCopyNodeData(*execPlanFwdChirp.rootPlan);
+    }
+    // when any fused node was processed, re-collect the main plan's leaves and fuse shims
+    if(!blueMultiFusedNodes.empty())
+    {
+        execPlan.execSeq.clear();
+        execPlan.fuseShims.clear();
+        execPlan.rootPlan->CollectLeaves(execPlan.execSeq, execPlan.fuseShims);
+    }
+}
+
+void AssignmentPolicy::AssignBuffers(ExecPlan& execPlan)
+{
+    AssignChirpBuffers(execPlan); // first handle children of multi-kernel fused Bluestein nodes
+
+    AssignBuffers_internal(execPlan); // then run the general assignment on the whole plan
+}
+
+void AssignmentPolicy::AssignBuffers_internal(ExecPlan& execPlan)
 {
     int maxFusions      = execPlan.fuseShims.size();
     numCurWinnerFusions = -1; // no winner yet
@@ -534,7 +630,7 @@
 
     // look for nodes that imply presence of other buffers (bluestein)
     RecursiveTraverse(execPlan.rootPlan.get(), [this](TreeNode* n) {
-        if(n->scheme == CS_KERNEL_CHIRP)
+        if(n->scheme == CS_BLUESTEIN)
         {
             availableBuffers.insert(OB_TEMP_BLUESTEIN);
             availableArrayTypes.insert(rocfft_array_type_complex_interleaved);
@@ -552,10 +648,10 @@
     {
         // we already satisfy the strategy, so don't need to go further
         if(execPlan.assignOptStrategy <= rocfft_optimize_min_buffer)
-            return true;
+            return;
         // we already fulfill all possible fusions
         if(numCurWinnerFusions == maxFusions)
-            return true;
+            return;
     }
 
     // if we are here:
@@ -578,10 +674,10 @@
     {
         // we already satisfy the strategy, so don't need to go further
         if(execPlan.assignOptStrategy <= rocfft_optimize_balance)
-            return true;
+            return;
         // we already fulfill all possible fusions
         if(numCurWinnerFusions == maxFusions)
-            return true;
+            return;
     }
 
     // Same as above: if we are here....
@@ -595,11 +691,10 @@
     //   in this ABTC try, winnerCandidates must contain C-buf (mustUseCBuffer=true)
     UpdateWinnerFromValidPaths(execPlan);
     if(numCurWinnerFusions != -1)
-        return true;
+        return;
 
     // else, we can't find any valid buffer assignment !
     throw std::runtime_error("Can't find valid buffer assignment with current buffers.");
-    return false;
 }
 
 void AssignmentPolicy::Enumerate(PlacementTrace*   parent,
@@ -653,20 +748,23 @@
     // auto startBuf  = parent->outBuf;
     // auto startType = parent->oType;
 
-    if(curNode->IsBluesteinChirpSetup())
+    if(!execPlan.IsChirpPlan && curNode->IsBluesteinChirpSetup())
     {
         auto blueNode = curNode;
         // bluestein setup kernels can input/output bluestein buffer only.
         do
         {
-            // chirp setup nodes must use bluestein buffer, not
-            // connected to other nodes, so just set their buffers
-            // directly and don't enumerate them with PlacementTraces
-            blueNode->obIn         = OB_TEMP_BLUESTEIN;
-            blueNode->inArrayType  = rocfft_array_type_complex_interleaved;
-            blueNode->obOut        = OB_TEMP_BLUESTEIN;
-            blueNode->outArrayType = rocfft_array_type_complex_interleaved;
-            blueNode->placement    = rocfft_placement_inplace;
+            if(blueNode->typeBlue != BT_MULTI_KERNEL_FUSED)
+            {
+                // chirp setup nodes must use bluestein buffer, not
+                // connected to other nodes, so just set their buffers
+                // directly and don't enumerate them with PlacementTraces
+                blueNode->obIn         = OB_TEMP_BLUESTEIN;
+                blueNode->inArrayType  = rocfft_array_type_complex_interleaved;
+                blueNode->obOut        = OB_TEMP_BLUESTEIN;
+                blueNode->outArrayType = rocfft_array_type_complex_interleaved;
+                blueNode->placement    = rocfft_placement_inplace;
+            }
 
             ++curSeqID;
             blueNode = execSeq[curSeqID];
@@ -1186,7 +1284,8 @@
             // middle of that.
             for(const auto& u : users)
             {
-                if(u.op == TempBufOp::BufWrite && u.length.size() == 1)
+                if(u.op == TempBufOp::BufWrite && u.node.parent
+                   && u.length.size() > u.node.parent->length.size())
                     return;
             }
 
diff -Nru rocfft-5.5.0/library/src/auxiliary.cpp rocfft-5.7.1/library/src/auxiliary.cpp
--- rocfft-5.5.0/library/src/auxiliary.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/auxiliary.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,12 +21,14 @@
 *******************************************************************************/
 
 #include "../../shared/environment.h"
+#include "../../shared/rocfft_hip.h"
 #include "logging.h"
 #include "repo.h"
 #include "rocfft.h"
-#include "rocfft_hip.h"
 #include "rocfft_ostream.hpp"
 #include "rtc_cache.h"
+#include "solution_map.h"
+#include "tuning_helper.h"
 #include <fcntl.h>
 #include <memory>
 
@@ -39,6 +41,7 @@
 int log_plan_fd     = -1;
 int log_kernelio_fd = -1;
 int log_rtc_fd      = -1;
+int log_tuning_fd   = -1;
 
 /**
  *  @brief Logging function
@@ -115,8 +118,16 @@
         // open log_rtc file
         if(layer_mode & rocfft_layer_mode_log_rtc)
             open_log_stream("ROCFFT_LOG_RTC_PATH", log_rtc_fd);
+
+        // open log_tuning file
+        if(layer_mode & rocfft_layer_mode_log_tuning)
+            open_log_stream("ROCFFT_LOG_TUNING_PATH", log_tuning_fd);
     }
 
+    // setup solution map once in program at the start of library use
+    solution_map::get_solution_map().setup();
+    TuningBenchmarker::GetSingleton().Setup();
+
     log_trace(__func__);
     return rocfft_status_success;
 }
@@ -133,6 +144,8 @@
     RTCCache::single.reset();
 #endif
 
+    TuningBenchmarker::GetSingleton().Clean();
+
     LogSingleton::GetInstance().SetLayerMode(rocfft_layer_mode_none);
     // Close log files
     if(log_trace_fd != -1)
@@ -165,9 +178,23 @@
         CLOSE(log_rtc_fd);
         log_rtc_fd = -1;
     }
+    if(log_tuning_fd != -1)
+    {
+        CLOSE(log_tuning_fd);
+        log_tuning_fd = -1;
+    }
 
     // stop all log worker threads
     rocfft_ostream::cleanup();
 
     return rocfft_status_success;
 }
+
+#ifdef ROCFFT_BUILD_OFFLINE_TUNER
+rocfft_status rocfft_get_offline_tuner_handle(void** offline_tuner)
+{ // hands out the TuningBenchmarker singleton after binding the library's solution map to it
+    TuningBenchmarker::GetSingleton().SetBindingSolutionMap(&solution_map::get_solution_map());
+    *offline_tuner = &(TuningBenchmarker::GetSingleton()); // NOTE(review): offline_tuner is not null-checked
+    return rocfft_status_success; // no failure path: success is always reported
+}
+#endif
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/chirp.cpp rocfft-5.7.1/library/src/chirp.cpp
--- rocfft-5.5.0/library/src/chirp.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/chirp.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,114 @@
+/******************************************************************************
+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#include "chirp.h"
+#include "../../shared/arithmetic.h"
+#include "../../shared/hipstream_wrapper.h"
+#include "../../shared/rocfft_complex.h"
+#include "../../shared/rocfft_hip.h"
+#include "rtc_cache.h"
+#include "rtc_chirp_kernel.h"
+#include "rtc_kernel.h"
+#include <cassert>
+#include <iostream>
+#include <math.h>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+
+// this vector stores the stream used to generate chirp for each
+// device id.  index in the vector is device id.  note that this
+// vector needs to be protected against concurrent access, but chirp
+// are always accessed through the Repo which guarantees exclusive access.
+static std::vector<hipStream_wrapper_t> chirp_streams;
+
+void chirp_streams_cleanup()
+{
+    chirp_streams.clear(); // drops every per-device stream wrapper held in the vector
+}
+
+template <typename Tcomplex>
+void launch_chirp_kernel(const size_t     N,
+                         rocfft_precision precision,
+                         const char*      gpu_arch,
+                         hipStream_t&     stream,
+                         Tcomplex*        output)
+{
+    // launch geometry: enough CHIRP_THREADS-sized blocks to cover N
+    auto threadsPerBlock = CHIRP_THREADS;
+    auto blockCount      = DivRoundingUp<size_t>(N, threadsPerBlock);
+    auto chirpKernel     = RTCKernelChirp::generate(gpu_arch, precision);
+    RTCKernelArgs launchArgs; // kernel arguments: element count, then output pointer
+    launchArgs.append_size_t(N);
+    launchArgs.append_ptr(output);
+    chirpKernel.launch(launchArgs, dim3(blockCount), dim3(threadsPerBlock), 0, stream);
+}
+
+template <typename Tcomplex>
+gpubuf chirp_create_pr(size_t           N,
+                       rocfft_precision precision,
+                       const char*      gpu_arch,
+                       unsigned int     deviceId)
+{
+    gpubuf chirp; // buffer returned to the caller once the chirp kernel has filled it
+
+    auto chirp_bytes = N * sizeof(Tcomplex); // N elements of the precision-specific complex type
+
+    if(chirp.alloc(chirp_bytes) != hipSuccess)
+        throw std::runtime_error("unable to allocate chirp length " + std::to_string(N));
+    // grow the per-device stream table on demand and reuse one stream per device id
+    if(deviceId >= chirp_streams.size())
+        chirp_streams.resize(deviceId + 1);
+    if(chirp_streams[deviceId] == nullptr)
+        chirp_streams[deviceId].alloc();
+    hipStream_t stream = chirp_streams[deviceId];
+
+    if(stream == nullptr) // NOTE(review): a stream created in this branch is never destroyed here
+    {
+        if(hipStreamCreate(&stream) != hipSuccess)
+            throw std::runtime_error("hipStreamCreate failure");
+    }
+
+    auto device_chirp_ptr = static_cast<Tcomplex*>(chirp.data());
+
+    launch_chirp_kernel(N, precision, gpu_arch, stream, device_chirp_ptr);
+    // wait for the stream so the kernel has finished before the buffer is returned
+    if(hipStreamSynchronize(stream) != hipSuccess)
+        throw std::runtime_error("hipStream failure");
+
+    return chirp;
+}
+
+gpubuf chirp_create(size_t N, rocfft_precision precision, const char* gpu_arch, unsigned int deviceId)
+{
+    switch(precision) // pick the complex element type that matches the requested precision
+    {
+    case rocfft_precision_single:
+        return chirp_create_pr<rocfft_complex<float>>(N, precision, gpu_arch, deviceId);
+    case rocfft_precision_double:
+        return chirp_create_pr<rocfft_complex<double>>(N, precision, gpu_arch, deviceId);
+    case rocfft_precision_half:
+        return chirp_create_pr<rocfft_complex<_Float16>>(N, precision, gpu_arch, deviceId);
+    }
+    throw std::runtime_error("invalid precision for chirp"); // never flow off the end of a non-void function
+}
diff -Nru rocfft-5.5.0/library/src/compute_scheme.cpp rocfft-5.7.1/library/src/compute_scheme.cpp
--- rocfft-5.5.0/library/src/compute_scheme.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/compute_scheme.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -21,10 +21,13 @@
 #include "compute_scheme.h"
 
 #include <map>
+#include <set>
+#include <stdexcept>
 
 #define TO_STR2(x) #x
 #define TO_STR(x) TO_STR2(x)
 #define ENUMSTR(x) x, TO_STR(x)
+#define STRENUM(x) TO_STR(x), x
 
 static const std::map<ComputeScheme, const char*>& ComputeSchemetoStringMap()
 {
@@ -89,3 +92,76 @@
 {
     return ComputeSchemetoStringMap().at(cs);
 }
+
+static std::map<std::string, ComputeScheme> StrToComputeSchemeMap()
+{
+    std::map<std::string, ComputeScheme> byName; // inverse of the scheme -> name table
+    for(const auto& entry : ComputeSchemetoStringMap())
+        byName.emplace(entry.second, entry.first); // key: name string, value: scheme
+    return byName;
+}
+
+ComputeScheme StrToComputeScheme(const std::string& str)
+{
+    static const auto nameLookup = StrToComputeSchemeMap(); // built once, on the first call
+    return nameLookup.at(str); // std::map::at throws std::out_of_range for an unknown name
+}
+
+// Schemes that could be a root problem, not a kernel.
+// TODO: it would be better to refactor ComputeScheme; it might be good to define
+// things like PROB_DESC (such as 3D_C2C, LARGE_1D_C2C), ALGORITHM (STOCKHAM, ...),
+// DECOMPOSITION (2D_RTRT, 1D_CC, 1D_TRTRT), ...
+static const std::set<ComputeScheme>& ProblemScheme()
+{
+    static const std::set<ComputeScheme> rootSchemes{ // function-local static: built once, on the first call
+        CS_KERNEL_STOCKHAM,
+        CS_REAL_TRANSFORM_USING_CMPLX,
+        CS_REAL_TRANSFORM_EVEN,
+        CS_REAL_2D_EVEN,
+        CS_REAL_3D_EVEN,
+        CS_BLUESTEIN,
+        CS_L1D_TRTRT,
+        CS_L1D_CC,
+        CS_L1D_CRT,
+        CS_2D_RTRT,
+        CS_2D_RC,
+        CS_KERNEL_2D_SINGLE,
+        CS_3D_TRTRTR,
+        CS_3D_RTRT,
+        CS_3D_BLOCK_RC,
+        CS_3D_BLOCK_CR,
+        CS_3D_RC};
+    return rootSchemes;
+}
+
+bool ComputeSchemeIsAProblem(ComputeScheme cs)
+{
+    return ProblemScheme().find(cs) != ProblemScheme().end(); // true when cs is a member of the set
+}
+
+// Returns the short abbreviation string for a kernel scheme.
+// Throws std::runtime_error for any scheme that is not listed below.
+std::string PrintKernelSchemeAbbr(ComputeScheme cs)
+{
+    switch(cs)
+    {
+    case CS_KERNEL_STOCKHAM:
+        return "sbrr";
+    case CS_KERNEL_STOCKHAM_BLOCK_CC:
+        return "sbcc";
+    case CS_KERNEL_STOCKHAM_BLOCK_CR:
+        return "sbcr";
+    case CS_KERNEL_STOCKHAM_BLOCK_RC:
+        return "sbrc";
+    case CS_KERNEL_2D_SINGLE:
+        return "2d_single";
+    case CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z:
+        return "sbrc_xy_z";
+    case CS_KERNEL_STOCKHAM_TRANSPOSE_Z_XY:
+        return "sbrc_z_xy";
+    case CS_KERNEL_STOCKHAM_R_TO_CMPLX_TRANSPOSE_Z_XY:
+        return "sbrc_erc_z_xy";
+    default: // every path returns or throws, so nothing follows the switch
+        throw std::runtime_error("unsupported scheme in PrintKernelSchemeAbbr");
+    }
+}
diff -Nru rocfft-5.5.0/library/src/device/CMakeLists.txt rocfft-5.7.1/library/src/device/CMakeLists.txt
--- rocfft-5.5.0/library/src/device/CMakeLists.txt	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/CMakeLists.txt	2023-08-09 16:19:51.000000000 +0000
@@ -158,20 +158,12 @@
 
 # function pool is a generated file, but put it in its own
 # library so it's easier to link to.
-#
-# build function pool into two separate libraries - one that has
-# functions callable by rocFFT and depends on amdhip64, and another
-# one usable by AOT RTC that contains no device code
 list( FILTER rocfft_device_source EXCLUDE REGEX function_pool.cpp )
 add_library( rocfft-function-pool OBJECT
   function_pool.cpp
 )
-target_compile_definitions( rocfft-function-pool PRIVATE FUNCTION_POOL_STANDALONE_BODY= )
-add_library( rocfft-function-pool-standalone OBJECT
-  function_pool.cpp
-)
-target_compile_definitions( rocfft-function-pool-standalone PRIVATE FUNCTION_POOL_STANDALONE_BODY={} )
-foreach( pool rocfft-function-pool rocfft-function-pool-standalone )
+
+foreach( pool rocfft-function-pool )
   target_include_directories( ${pool}
     PRIVATE
     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/library/src/device>
@@ -179,6 +171,8 @@
     $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
   )
   set_target_properties( ${pool} PROPERTIES
+    CXX_VISIBILITY_PRESET "hidden"
+    VISIBILITY_INLINES_HIDDEN ON
     CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
     POSITION_INDEPENDENT_CODE ON
diff -Nru rocfft-5.5.0/library/src/device/generator/bluestein_generator.h rocfft-5.7.1/library/src/device/generator/bluestein_generator.h
--- rocfft-5.5.0/library/src/device/generator/bluestein_generator.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/bluestein_generator.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,1669 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include "generator.h"
+
+enum BluesteinOperationType // enumerator values are used as indices into function_name / intrinsic_function_name
+{
+    BFN_LOAD_CC_FWD_CHIRP, // first enumerator, value 0; the six LOAD types come first
+    BFN_LOAD_RC_FWD_CHIRP,
+    BFN_LOAD_CC_FWD_CHIRP_MUL,
+    BFN_LOAD_RC_FWD_CHIRP_MUL,
+    BFN_LOAD_CC_INV_CHIRP_MUL,
+    BFN_LOAD_RC_INV_CHIRP_MUL,
+    BFN_STORE_CC_FWD_CHIRP, // the six STORE types follow, in the same variant order as the loads
+    BFN_STORE_RC_FWD_CHIRP,
+    BFN_STORE_CC_FWD_CHIRP_MUL,
+    BFN_STORE_RC_FWD_CHIRP_MUL,
+    BFN_STORE_CC_INV_CHIRP_MUL,
+    BFN_STORE_RC_INV_CHIRP_MUL,
+};
+
+struct BluesteinData // bundle of named Variable objects, each built from a (name, type) string pair
+{
+public:
+    BluesteinData() {}
+
+    //
+    // templates
+    //
+    Variable scalar_type{"scalar_type", "typename"};
+    Variable callback_type{"cbtype", "CallbackType"}; // note: rendered name is "cbtype", not "callback_type"
+
+    //
+    // internal variables
+    //
+    Variable chirp{"chirp", "const scalar_type* __restrict__ "};
+    Variable data_idx{"data_idx", "size_t"};
+    Variable data_voffset{"data_voffset", "size_t"};
+    Variable data_soffset{"data_soffset", "size_t"};
+    Variable data_rw_flag{"data_rw_flag", "bool"};
+    Variable buf_in{"buf_in", "scalar_type*"};
+    Variable buf_inre{"buf_inre", "real_type_t<scalar_type>* __restrict__ "};
+    Variable buf_inim{"buf_inim", "real_type_t<scalar_type>* __restrict__ "};
+    Variable buf_out{"buf_out", "scalar_type*"};
+    Variable buf_outre{"buf_outre", "real_type_t<scalar_type>* __restrict__ "};
+    Variable buf_outim{"buf_outim", "real_type_t<scalar_type>* __restrict__ "};
+    Variable data_buf{"data_buf", "scalar_type*"};
+    Variable data_bufre{"data_bufre", "real_type_t<scalar_type>* __restrict__ "};
+    Variable data_bufim{"data_bufim", "real_type_t<scalar_type>* __restrict__ "};
+    Variable data_elem{"data_elem", "scalar_type"};
+    Variable data_elemre{"data_elemre", "real_type_t<scalar_type> __restrict__ "}; // NOTE(review): __restrict__ on a non-pointer type - confirm this type string is intended
+    Variable data_elemim{"data_elemim", "real_type_t<scalar_type> __restrict__ "}; // NOTE(review): same non-pointer __restrict__ as data_elemre
+    Variable length_N_blue{"length_N_blue", "const size_t"};
+    Variable length_M_blue{"length_M_blue", "const size_t"};
+    Variable global_stride_in_0{"global_stride_in_0", "const size_t"};
+    Variable global_stride_in_1{"global_stride_in_1", "const size_t"};
+    Variable global_idist{"global_idist", "const size_t"};
+    Variable global_stride_out_0{"global_stride_out_0", "const size_t"};
+    Variable global_stride_out_1{"global_stride_out_1", "const size_t"};
+    Variable global_odist{"global_odist", "const size_t"};
+    Variable transform_idx{"transform_idx", "const size_t"};
+
+    //
+    // variables borrowed from stockham_gen_base.h
+    //
+    Variable global_data_id{"global_data_id", "size_t"};
+    Variable global_transf_id{"global_transf_id", "size_t"};
+    Variable load_cb_data{"load_cb_data", "void*"};
+    Variable load_cb_fn{"load_cb_fn", "void", true, true}; // meaning of the two bool arguments is defined by Variable (not visible here) - TODO confirm
+    Variable store_cb_data{"store_cb_data", "void*"};
+    Variable store_cb_fn{"store_cb_fn", "void", true, true}; // same four-argument Variable form as load_cb_fn
+};
+
+class BluesteinFunction
+{
+public:
+    Expression get_load_op(const BluesteinOperationType& type, const LoadGlobal& x, bool planar)
+    {
+        return get_load_op(type, x.args[1], planar); // forwards x.args[1] as the index expression
+    }
+
+    Expression
+        get_load_op(const BluesteinOperationType& type, const LoadGlobalPlanar& x, bool planar)
+    {
+        return get_load_op(type, x.args[2], planar); // forwards x.args[2] as the index expression
+    }
+
+    Expression get_load_op(const BluesteinOperationType& type, const IntrinsicLoad& x, bool planar)
+    {
+        return get_intrinsic_load_op(type, x.args[1], x.args[2], x.args[3], planar); // args[1..3] are passed as voffset, soffset, rw_flag
+    }
+
+    Expression
+        get_load_op(const BluesteinOperationType& type, const IntrinsicLoadPlanar& x, bool planar)
+    {
+        return get_intrinsic_load_op(type, x.args[2], x.args[3], x.args[4], planar); // args[2..4] are passed as voffset, soffset, rw_flag
+    }
+
+    Statement get_store_op(const BluesteinOperationType& type, const StoreGlobal& x, bool planar)
+    {
+        return get_store_op(type, x.index, x.value, planar); // forwards the node's index and value members
+    }
+
+    Statement
+        get_store_op(const BluesteinOperationType& type, const StoreGlobalPlanar& x, bool planar)
+    {
+        return get_store_op(type, x.index, x.value, planar); // forwards the node's index and value members
+    }
+
+    Statement get_store_op(const BluesteinOperationType& type, const IntrinsicStore& x, bool planar)
+    {
+        return get_intrinsic_store_op(type, x.voffset, x.soffset, x.rw_flag, x.value, planar); // forwards the node's members unchanged
+    }
+
+    Statement
+        get_store_op(const BluesteinOperationType& type, const IntrinsicStorePlanar& x, bool planar)
+    {
+        return get_intrinsic_store_op(type, x.voffset, x.soffset, x.rw_flag, x.value, planar); // forwards the node's members unchanged
+    }
+
+    // Returns the name of the (non-intrinsic) Bluestein device function that
+    // corresponds to the given operation type.
+    //
+    // The enumerators of BluesteinOperationType are used directly as indices
+    // into function_name, so every known enumerator maps to function_name[type].
+    //
+    // Throws std::runtime_error when type holds a value that is not a known
+    // enumerator. Without that, control would flow off the end of this
+    // non-void function, which is undefined behaviour.
+    std::string get_op_name(const BluesteinOperationType& type)
+    {
+        switch(type)
+        {
+        // no default label, so the compiler can still flag a newly added enumerator
+        case BFN_LOAD_CC_FWD_CHIRP:
+        case BFN_LOAD_RC_FWD_CHIRP:
+        case BFN_LOAD_CC_FWD_CHIRP_MUL:
+        case BFN_LOAD_RC_FWD_CHIRP_MUL:
+        case BFN_LOAD_CC_INV_CHIRP_MUL:
+        case BFN_LOAD_RC_INV_CHIRP_MUL:
+        case BFN_STORE_CC_FWD_CHIRP:
+        case BFN_STORE_RC_FWD_CHIRP:
+        case BFN_STORE_CC_FWD_CHIRP_MUL:
+        case BFN_STORE_RC_FWD_CHIRP_MUL:
+        case BFN_STORE_CC_INV_CHIRP_MUL:
+        case BFN_STORE_RC_INV_CHIRP_MUL:
+            return function_name[type];
+        }
+        throw std::runtime_error("unsupported bluestein operation type");
+    }
+
+    // Returns the name of the intrinsic Bluestein device function that
+    // corresponds to the given operation type.
+    //
+    // The enumerators of BluesteinOperationType are used directly as indices
+    // into intrinsic_function_name, so every known enumerator maps to
+    // intrinsic_function_name[type].
+    //
+    // Throws std::runtime_error when type holds a value that is not a known
+    // enumerator, instead of flowing off the end (undefined behaviour).
+    std::string get_intrinsic_op_name(const BluesteinOperationType& type)
+    {
+        switch(type)
+        {
+        // no default label, so the compiler can still flag a newly added enumerator
+        case BFN_LOAD_CC_FWD_CHIRP:
+        case BFN_LOAD_RC_FWD_CHIRP:
+        case BFN_LOAD_CC_FWD_CHIRP_MUL:
+        case BFN_LOAD_RC_FWD_CHIRP_MUL:
+        case BFN_LOAD_CC_INV_CHIRP_MUL:
+        case BFN_LOAD_RC_INV_CHIRP_MUL:
+        case BFN_STORE_CC_FWD_CHIRP:
+        case BFN_STORE_RC_FWD_CHIRP:
+        case BFN_STORE_CC_FWD_CHIRP_MUL:
+        case BFN_STORE_RC_FWD_CHIRP_MUL:
+        case BFN_STORE_CC_INV_CHIRP_MUL:
+        case BFN_STORE_RC_INV_CHIRP_MUL:
+            return intrinsic_function_name[type];
+        }
+        throw std::runtime_error("unsupported bluestein operation type");
+    }
+
+private:
+    Expression get_load_op(const BluesteinOperationType& type, const Expression& index, bool planar)
+    { // Builds the CallExpr for the load function selected by type; planar picks the buf_inre/buf_inim argument list over buf_in.
+        std::unique_ptr<Expression> op; // assigned by whichever case matches below
+
+        switch(type)
+        {
+        case BFN_LOAD_CC_FWD_CHIRP: // single argument list: planar is not consulted here
+        {
+            op = std::make_unique<Expression>(CallExpr{
+                get_op_name(BFN_LOAD_CC_FWD_CHIRP) + render_template(),
+                {data.chirp, data.global_transf_id, data.length_N_blue, data.length_M_blue}});
+            break;
+        }
+        case BFN_LOAD_RC_FWD_CHIRP:
+        {
+            op = std::make_unique<Expression>(
+                planar
+                    ? CallExpr{get_op_name(BFN_LOAD_RC_FWD_CHIRP) + render_template(),
+                               {data.global_transf_id,
+                                data.buf_inre,
+                                data.buf_inim,
+                                data.load_cb_fn,
+                                data.load_cb_data}}
+                    : CallExpr{
+                        get_op_name(BFN_LOAD_RC_FWD_CHIRP) + render_template(),
+                        {data.global_transf_id, data.buf_in, data.load_cb_fn, data.load_cb_data}});
+            break;
+        }
+        case BFN_LOAD_CC_FWD_CHIRP_MUL: // the only case that passes index through
+        {
+            op = std::make_unique<Expression>(
+                planar ? CallExpr{get_op_name(BFN_LOAD_CC_FWD_CHIRP_MUL) + render_template(),
+                                  {data.chirp,
+                                   data.global_transf_id,
+                                   index,
+                                   data.length_N_blue,
+                                   data.buf_inre,
+                                   data.buf_inim,
+                                   data.load_cb_fn,
+                                   data.load_cb_data}}
+                       : CallExpr{get_op_name(BFN_LOAD_CC_FWD_CHIRP_MUL) + render_template(),
+                                  {data.chirp,
+                                   data.global_transf_id,
+                                   index,
+                                   data.length_N_blue,
+                                   data.buf_in,
+                                   data.load_cb_fn,
+                                   data.load_cb_data}});
+            break;
+        }
+        case BFN_LOAD_RC_FWD_CHIRP_MUL:
+        {
+            op = std::make_unique<Expression>(
+                planar ? CallExpr{get_op_name(BFN_LOAD_RC_FWD_CHIRP_MUL) + render_template(),
+                                  {data.global_data_id,
+                                   data.buf_inre,
+                                   data.buf_inim,
+                                   data.load_cb_fn,
+                                   data.load_cb_data}}
+                       : CallExpr{
+                           get_op_name(BFN_LOAD_RC_FWD_CHIRP_MUL) + render_template(),
+                           {data.global_data_id, data.buf_in, data.load_cb_fn, data.load_cb_data}});
+            break;
+        }
+        case BFN_LOAD_CC_INV_CHIRP_MUL:
+        {
+            op = std::make_unique<Expression>(
+                planar ? CallExpr{get_op_name(BFN_LOAD_CC_INV_CHIRP_MUL) + render_template(),
+                                  {data.global_transf_id,
+                                   data.global_data_id,
+                                   data.length_M_blue,
+                                   data.buf_inre,
+                                   data.buf_inim,
+                                   data.load_cb_fn,
+                                   data.load_cb_data}}
+                       : CallExpr{get_op_name(BFN_LOAD_CC_INV_CHIRP_MUL) + render_template(),
+                                  {data.global_transf_id,
+                                   data.global_data_id,
+                                   data.length_M_blue,
+                                   data.buf_in,
+                                   data.load_cb_fn,
+                                   data.load_cb_data}});
+            break;
+        }
+        case BFN_LOAD_RC_INV_CHIRP_MUL:
+        {
+            op = std::make_unique<Expression>(
+                planar ? CallExpr{get_op_name(BFN_LOAD_RC_INV_CHIRP_MUL) + render_template(),
+                                  {data.global_data_id,
+                                   data.buf_inre,
+                                   data.buf_inim,
+                                   data.load_cb_fn,
+                                   data.load_cb_data}}
+                       : CallExpr{
+                           get_op_name(BFN_LOAD_RC_INV_CHIRP_MUL) + render_template(),
+                           {data.global_data_id, data.buf_in, data.load_cb_fn, data.load_cb_data}});
+            break;
+        }
+        default: // any type not handled above (this includes every STORE type) is rejected
+            throw std::runtime_error("unsupported bluestein fuse operation");
+        }
+
+        return *op; // returns a copy of the expression built above
+    }
+
+    Expression get_intrinsic_load_op(const BluesteinOperationType& type,
+                                     const Expression&             voffset,
+                                     const Expression&             soffset,
+                                     const Expression&             rw_flag,
+                                     bool                          planar)
+    { // Builds the CallExpr for the intrinsic load function selected by type; only three CC load types are handled.
+        std::unique_ptr<Expression> op; // assigned by whichever case matches below
+
+        switch(type)
+        {
+        case BFN_LOAD_CC_FWD_CHIRP: // single argument list: planar, voffset and soffset are not used here
+        {
+            op = std::make_unique<Expression>(
+                CallExpr{get_intrinsic_op_name(BFN_LOAD_CC_FWD_CHIRP) + render_template(),
+                         {data.chirp,
+                          data.global_transf_id,
+                          rw_flag,
+                          data.length_N_blue,
+                          data.length_M_blue}});
+            break;
+        }
+        case BFN_LOAD_CC_FWD_CHIRP_MUL: // the only case that passes voffset and soffset through
+        {
+            op = std::make_unique<Expression>(
+                planar
+                    ? CallExpr{get_intrinsic_op_name(BFN_LOAD_CC_FWD_CHIRP_MUL) + render_template(),
+                               {data.chirp,
+                                data.global_transf_id,
+                                voffset,
+                                soffset,
+                                rw_flag,
+                                data.length_N_blue,
+                                data.buf_inre,
+                                data.buf_inim,
+                                data.load_cb_fn,
+                                data.load_cb_data}}
+                    : CallExpr{get_intrinsic_op_name(BFN_LOAD_CC_FWD_CHIRP_MUL) + render_template(),
+                               {data.chirp,
+                                data.global_transf_id,
+                                voffset,
+                                soffset,
+                                rw_flag,
+                                data.length_N_blue,
+                                data.buf_in,
+                                data.load_cb_fn,
+                                data.load_cb_data}});
+            break;
+        }
+        case BFN_LOAD_CC_INV_CHIRP_MUL: // passes the global ids and a literal 0 instead of voffset/soffset
+        {
+            op = std::make_unique<Expression>(
+                planar
+                    ? CallExpr{get_intrinsic_op_name(BFN_LOAD_CC_INV_CHIRP_MUL) + render_template(),
+                               {data.global_transf_id,
+                                data.global_data_id,
+                                Literal{"0"},
+                                rw_flag,
+                                data.length_M_blue,
+                                data.buf_inre,
+                                data.buf_inim,
+                                data.load_cb_fn,
+                                data.load_cb_data}}
+                    : CallExpr{get_intrinsic_op_name(BFN_LOAD_CC_INV_CHIRP_MUL) + render_template(),
+                               {data.global_transf_id,
+                                data.global_data_id,
+                                Literal{"0"},
+                                rw_flag,
+                                data.length_M_blue,
+                                data.buf_in,
+                                data.load_cb_fn,
+                                data.load_cb_data}});
+            break;
+        }
+        default: // the RC load types and every STORE type end up here and are rejected
+            throw std::runtime_error("unsupported bluestein fuse operation");
+        }
+
+        return *op; // returns a copy of the expression built above
+    }
+
+    Statement get_store_op(const BluesteinOperationType& type,
+                           const Expression&             index,
+                           const Expression&             value,
+                           bool                          planar)
+    { // Builds the Call statement for the store function selected by type; planar picks the buf_outre/buf_outim argument list over buf_out.
+        std::unique_ptr<Statement> op; // assigned by whichever case matches below
+
+        switch(type)
+        {
+        case BFN_STORE_CC_FWD_CHIRP:
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_op_name(BFN_STORE_CC_FWD_CHIRP) + render_template(),
+                              {data.global_transf_id,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_op_name(BFN_STORE_CC_FWD_CHIRP) + render_template(),
+                              {data.global_transf_id,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_RC_FWD_CHIRP:
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_op_name(BFN_STORE_RC_FWD_CHIRP) + render_template(),
+                              {data.global_transf_id,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_op_name(BFN_STORE_RC_FWD_CHIRP) + render_template(),
+                              {data.global_transf_id,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_CC_FWD_CHIRP_MUL:
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_op_name(BFN_STORE_CC_FWD_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_op_name(BFN_STORE_CC_FWD_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_RC_FWD_CHIRP_MUL:
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_op_name(BFN_STORE_RC_FWD_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               data.length_M_blue,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_op_name(BFN_STORE_RC_FWD_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               data.length_M_blue,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_CC_INV_CHIRP_MUL:
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_op_name(BFN_STORE_CC_INV_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_op_name(BFN_STORE_CC_INV_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_RC_INV_CHIRP_MUL: // the only case that passes index (and data.chirp) through
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_op_name(BFN_STORE_RC_INV_CHIRP_MUL) + render_template(),
+                              {data.chirp,
+                               data.global_transf_id,
+                               index,
+                               data.length_N_blue,
+                               data.length_M_blue,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_op_name(BFN_STORE_RC_INV_CHIRP_MUL) + render_template(),
+                              {data.chirp,
+                               data.global_transf_id,
+                               index,
+                               data.length_N_blue,
+                               data.length_M_blue,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        default: // any type not handled above (this includes every LOAD type) is rejected
+            throw std::runtime_error("unsupported bluestein fuse operation");
+        }
+
+        return *op; // returns a copy of the statement built above
+    }
+
+    Statement get_intrinsic_store_op(const BluesteinOperationType& type,
+                                     const Expression&             voffset,
+                                     const Expression&             soffset,
+                                     const Expression&             rw_flag,
+                                     const Expression&             value,
+                                     bool                          planar)
+    { // Builds the Call statement for the intrinsic store function selected by type. NOTE(review): voffset and soffset are never referenced in this body - confirm intended.
+        std::unique_ptr<Statement> op; // assigned by whichever case matches below
+
+        switch(type)
+        {
+        case BFN_STORE_CC_FWD_CHIRP: // passes global_transf_id and a literal 0 ahead of rw_flag
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_intrinsic_op_name(BFN_STORE_CC_FWD_CHIRP) + render_template(),
+                              {data.global_transf_id,
+                               Literal{"0"},
+                               rw_flag,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_intrinsic_op_name(BFN_STORE_CC_FWD_CHIRP) + render_template(),
+                              {data.global_transf_id,
+                               Literal{"0"},
+                               rw_flag,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_CC_FWD_CHIRP_MUL: // passes global_data_id and a literal 0 ahead of rw_flag
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_intrinsic_op_name(BFN_STORE_CC_FWD_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               Literal{"0"},
+                               rw_flag,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_intrinsic_op_name(BFN_STORE_CC_FWD_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               Literal{"0"},
+                               rw_flag,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        case BFN_STORE_CC_INV_CHIRP_MUL: // same argument layout as BFN_STORE_CC_FWD_CHIRP_MUL
+        {
+            op = std::make_unique<Statement>(
+                planar ? Call{get_intrinsic_op_name(BFN_STORE_CC_INV_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               Literal{"0"},
+                               rw_flag,
+                               data.buf_outre,
+                               data.buf_outim,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}}
+                       : Call{get_intrinsic_op_name(BFN_STORE_CC_INV_CHIRP_MUL) + render_template(),
+                              {data.global_data_id,
+                               Literal{"0"},
+                               rw_flag,
+                               data.buf_out,
+                               value,
+                               data.store_cb_fn,
+                               data.store_cb_data}});
+            break;
+        }
+        default: // the RC store types and every LOAD type end up here and are rejected
+            throw std::runtime_error("unsupported bluestein fuse operation");
+        }
+
+        return *op; // returns a copy of the statement built above
+    }
+
+    std::string render_template()
+    {
+        return "<" + data.scalar_type.render() + ", " + data.callback_type.render() + ">"; // "<A, B>" suffix appended to every function name built in this class
+    }
+
+    const std::vector<std::string> function_name = {"bluestein_load_cc_fwd_chirp_device",
+                                                    "bluestein_load_rc_fwd_chirp_device",
+                                                    "bluestein_load_cc_fwd_chirp_mul_device",
+                                                    "bluestein_load_rc_fwd_chirp_mul_device",
+                                                    "bluestein_load_cc_inv_chirp_mul_device",
+                                                    "bluestein_load_rc_inv_chirp_mul_device",
+                                                    "bluestein_store_cc_fwd_chirp_device", // the six store helpers follow the six load helpers
+                                                    "bluestein_store_rc_fwd_chirp_device",
+                                                    "bluestein_store_cc_fwd_chirp_mul_device",
+                                                    "bluestein_store_rc_fwd_chirp_mul_device",
+                                                    "bluestein_store_cc_inv_chirp_mul_device",
+                                                    "bluestein_store_rc_inv_chirp_mul_device"}; // NOTE(review): presumably indexed by the BFN_* operation value - confirm the order matches the enum
+
+    const std::vector<std::string> intrinsic_function_name // same entries and order as function_name, with "intrinsic_" inserted
+        = {"bluestein_intrinsic_load_cc_fwd_chirp_device",
+           "bluestein_intrinsic_load_rc_fwd_chirp_device",
+           "bluestein_intrinsic_load_cc_fwd_chirp_mul_device",
+           "bluestein_intrinsic_load_rc_fwd_chirp_mul_device",
+           "bluestein_intrinsic_load_cc_inv_chirp_mul_device",
+           "bluestein_intrinsic_load_rc_inv_chirp_mul_device",
+           "bluestein_intrinsic_store_cc_fwd_chirp_device",
+           "bluestein_intrinsic_store_rc_fwd_chirp_device",
+           "bluestein_intrinsic_store_cc_fwd_chirp_mul_device",
+           "bluestein_intrinsic_store_rc_fwd_chirp_mul_device",
+           "bluestein_intrinsic_store_cc_inv_chirp_mul_device",
+           "bluestein_intrinsic_store_rc_inv_chirp_mul_device"}; // NOTE(review): presumably indexed by the BFN_* operation value - confirm
+
+    BluesteinData data; // supplies the call arguments and the template suffix built by render_template()
+};
+
+class BluesteinKernel
+{
+public:
+    BluesteinKernel(ComputeScheme     scheme, // only CS_KERNEL_STOCKHAM_BLOCK_CC / _BLOCK_RC are dispatched
+                    BluesteinFuseType type, // fused operation (BFT_*) whose helpers are generated
+                    int               direction, // -1 = forward, +1 = inverse
+                    bool              planar_load, // loads read the data_bufre / data_bufim pair
+                    bool              planar_store, // stores write the data_bufre / data_bufim pair
+                    bool              intrinsic, // build Intrinsic* load/store nodes and intrinsic op names
+                    bool              enable_scaling = false, // planar stores receive scale_factor when true
+                    const Expression& scale_factor   = Literal{"1"})
+        : scheme(scheme)
+        , type(type)
+        , direction(direction)
+        , planar_load(planar_load)
+        , planar_store(planar_store)
+        , intrinsic(intrinsic)
+        , enable_scaling(enable_scaling)
+        , scale_factor(scale_factor) // copied, so the defaulted temporary does not dangle
+    {
+    }
+
+    Function generate_device_load_function()
+    {
+        // Pick the generator for the fused load helper matching (scheme, type).
+        // Only the block-CC and block-RC Stockham schemes are handled; any other
+        // scheme, BFT_NONE, or an unknown type ends at the throw below.
+        const bool block_cc = (scheme == CS_KERNEL_STOCKHAM_BLOCK_CC);
+        const bool block_rc = (scheme == CS_KERNEL_STOCKHAM_BLOCK_RC);
+
+        if(block_cc || block_rc)
+        {
+            // block_cc wins whenever it is set, so CC keeps priority over RC
+            switch(type)
+            {
+            case BFT_FWD_CHIRP:
+                return block_cc ? generate_fwd_chirp_load_cc() : generate_fwd_chirp_load_rc();
+            case BFT_FWD_CHIRP_MUL:
+                return block_cc ? generate_fwd_chirp_mul_load_cc()
+                                : generate_fwd_chirp_mul_load_rc();
+            case BFT_INV_CHIRP_MUL:
+                return block_cc ? generate_inv_chirp_mul_load_cc()
+                                : generate_inv_chirp_mul_load_rc();
+            default: // BFT_NONE and anything unrecognised
+                break;
+            }
+        }
+
+        throw std::runtime_error("unsupported bluestein fuse scheme");
+    }
+
+    Function generate_device_store_function()
+    {
+        // Pick the generator for the fused store helper matching (scheme, type).
+        // Only the block-CC and block-RC Stockham schemes are handled; any other
+        // scheme, BFT_NONE, or an unknown type ends at the throw below.
+        const bool block_cc = (scheme == CS_KERNEL_STOCKHAM_BLOCK_CC);
+        const bool block_rc = (scheme == CS_KERNEL_STOCKHAM_BLOCK_RC);
+
+        if(block_cc || block_rc)
+        {
+            // block_cc wins whenever it is set, so CC keeps priority over RC
+            switch(type)
+            {
+            case BFT_FWD_CHIRP:
+                return block_cc ? generate_fwd_chirp_store_cc() : generate_fwd_chirp_store_rc();
+            case BFT_FWD_CHIRP_MUL:
+                return block_cc ? generate_fwd_chirp_mul_store_cc()
+                                : generate_fwd_chirp_mul_store_rc();
+            case BFT_INV_CHIRP_MUL:
+                return block_cc ? generate_inv_chirp_mul_store_cc()
+                                : generate_inv_chirp_mul_store_rc();
+            default: // BFT_NONE and anything unrecognised
+                break;
+            }
+        }
+
+        throw std::runtime_error("unsupported bluestein fuse scheme");
+    }
+
+private:
+    // Template parameters shared by every generated helper: scalar type, then callback type.
+    TemplateList get_template_list()
+    {
+        TemplateList params;
+        params.append(blueData.scalar_type);
+        params.append(blueData.callback_type);
+        return params;
+    }
+
+    // Append the data buffer argument(s): the data_bufre/data_bufim pair when planar, else data_buf.
+    void append_data_buf(ArgumentList& args, bool planar)
+    {
+        if(!planar)
+        {
+            args.append(blueData.data_buf);
+            return;
+        }
+
+        args.append(blueData.data_bufre);
+        args.append(blueData.data_bufim);
+    }
+
+    // Append the addressing argument(s): intrinsic helpers take
+    // (data_voffset, data_soffset, data_rw_flag); all others take data_idx only.
+    void append_data_index(ArgumentList& args, bool intrinsic)
+    {
+        if(!intrinsic)
+        {
+            args.append(blueData.data_idx);
+            return;
+        }
+        args.append(blueData.data_voffset);
+        args.append(blueData.data_soffset);
+        args.append(blueData.data_rw_flag);
+    }
+
+    // Build the expression that loads one element using the names in blueData.
+    // planar_load selects the data_bufre/data_bufim pair over data_buf; intrinsic
+    // selects the Intrinsic* node addressed by (data_voffset, data_soffset,
+    // data_rw_flag) over the plain node addressed by data_idx.
+    std::unique_ptr<Expression> get_load_expression()
+    {
+        if(planar_load && intrinsic)
+            return std::make_unique<Expression>(IntrinsicLoadPlanar({
+                blueData.data_bufre,
+                blueData.data_bufim,
+                blueData.data_voffset,
+                blueData.data_soffset,
+                blueData.data_rw_flag,
+            }));
+
+        if(planar_load)
+            return std::make_unique<Expression>(LoadGlobalPlanar({
+                blueData.data_bufre,
+                blueData.data_bufim,
+                blueData.data_idx,
+            }));
+
+        if(intrinsic)
+            return std::make_unique<Expression>(IntrinsicLoad({
+                blueData.data_buf,
+                blueData.data_voffset,
+                blueData.data_soffset,
+                blueData.data_rw_flag,
+            }));
+
+        return std::make_unique<Expression>(LoadGlobal{
+            blueData.data_buf,
+            blueData.data_idx,
+        });
+    }
+
+    // Build a load expression addressed by an explicit index expression.
+    // planar_load selects the data_bufre/data_bufim pair over data_buf. Intrinsic
+    // nodes receive index in the voffset position, a literal 0 in the soffset
+    // position, and blueData.data_rw_flag; plain nodes receive index only.
+    std::unique_ptr<Expression> get_load_expression(const Expression& index)
+    {
+        if(planar_load && intrinsic)
+            return std::make_unique<Expression>(IntrinsicLoadPlanar({
+                blueData.data_bufre,
+                blueData.data_bufim,
+                index,
+                0,
+                blueData.data_rw_flag,
+            }));
+
+        if(planar_load)
+            return std::make_unique<Expression>(LoadGlobalPlanar({
+                blueData.data_bufre,
+                blueData.data_bufim,
+                index,
+            }));
+
+        if(intrinsic)
+            return std::make_unique<Expression>(IntrinsicLoad({
+                blueData.data_buf,
+                index,
+                0,
+                blueData.data_rw_flag,
+            }));
+
+        return std::make_unique<Expression>(LoadGlobal{
+            blueData.data_buf,
+            index,
+        });
+    }
+
+    // Build the statement that stores blueData.data_elem through the names in
+    // blueData. planar_store selects the data_bufre/data_bufim pair over
+    // data_buf; intrinsic selects the Intrinsic* node addressed by
+    // (data_voffset, data_soffset, data_rw_flag) over the plain node addressed
+    // by data_idx. Planar nodes take scale_factor when enable_scaling is set
+    // and std::nullopt otherwise.
+    // NOTE(review): the interleaved (non-planar) nodes are built without any
+    // scale argument, so enable_scaling has no effect on them - confirm intent.
+    std::unique_ptr<Statement> get_store_statement()
+    {
+        if(planar_store && intrinsic && enable_scaling)
+            return std::make_unique<Statement>(IntrinsicStorePlanar(blueData.data_bufre,
+                                                                    blueData.data_bufim,
+                                                                    blueData.data_voffset,
+                                                                    blueData.data_soffset,
+                                                                    blueData.data_elem,
+                                                                    blueData.data_rw_flag,
+                                                                    scale_factor));
+
+        if(planar_store && intrinsic)
+            return std::make_unique<Statement>(IntrinsicStorePlanar(blueData.data_bufre,
+                                                                    blueData.data_bufim,
+                                                                    blueData.data_voffset,
+                                                                    blueData.data_soffset,
+                                                                    blueData.data_elem,
+                                                                    blueData.data_rw_flag,
+                                                                    std::nullopt));
+
+        if(planar_store && enable_scaling)
+            return std::make_unique<Statement>(StoreGlobalPlanar(blueData.data_bufre,
+                                                                 blueData.data_bufim,
+                                                                 blueData.data_idx,
+                                                                 blueData.data_elem,
+                                                                 scale_factor));
+
+        if(planar_store)
+            return std::make_unique<Statement>(StoreGlobalPlanar(blueData.data_bufre,
+                                                                 blueData.data_bufim,
+                                                                 blueData.data_idx,
+                                                                 blueData.data_elem,
+                                                                 std::nullopt));
+
+        if(intrinsic)
+            return std::make_unique<Statement>(IntrinsicStore(blueData.data_buf,
+                                                              blueData.data_voffset,
+                                                              blueData.data_soffset,
+                                                              blueData.data_elem,
+                                                              blueData.data_rw_flag));
+
+        return std::make_unique<Statement>(
+            StoreGlobal(blueData.data_buf, blueData.data_idx, blueData.data_elem));
+    }
+
+    // Build a statement storing blueData.data_elem at an explicit index
+    // expression. planar_store selects the data_bufre/data_bufim pair over
+    // data_buf. Intrinsic nodes receive index in the voffset position, a
+    // literal 0 in the soffset position, and blueData.data_rw_flag; plain nodes
+    // receive index only. Planar nodes take scale_factor when enable_scaling is
+    // set and std::nullopt otherwise.
+    // NOTE(review): the interleaved (non-planar) nodes are built without any
+    // scale argument, so enable_scaling has no effect on them - confirm intent.
+    std::unique_ptr<Statement> get_store_statement(const Expression& index)
+    {
+        if(planar_store && intrinsic && enable_scaling)
+            return std::make_unique<Statement>(IntrinsicStorePlanar(blueData.data_bufre,
+                                                                    blueData.data_bufim,
+                                                                    index,
+                                                                    0,
+                                                                    blueData.data_elem,
+                                                                    blueData.data_rw_flag,
+                                                                    scale_factor));
+
+        if(planar_store && intrinsic)
+            return std::make_unique<Statement>(IntrinsicStorePlanar(blueData.data_bufre,
+                                                                    blueData.data_bufim,
+                                                                    index,
+                                                                    0,
+                                                                    blueData.data_elem,
+                                                                    blueData.data_rw_flag,
+                                                                    std::nullopt));
+
+        if(planar_store && enable_scaling)
+            return std::make_unique<Statement>(StoreGlobalPlanar(blueData.data_bufre,
+                                                                 blueData.data_bufim,
+                                                                 index,
+                                                                 blueData.data_elem,
+                                                                 scale_factor));
+
+        if(planar_store)
+            return std::make_unique<Statement>(StoreGlobalPlanar(blueData.data_bufre,
+                                                                 blueData.data_bufim,
+                                                                 index,
+                                                                 blueData.data_elem,
+                                                                 std::nullopt));
+
+        if(intrinsic)
+            return std::make_unique<Statement>(IntrinsicStore(
+                blueData.data_buf, index, 0, blueData.data_elem, blueData.data_rw_flag));
+
+        return std::make_unique<Statement>(
+            StoreGlobal(blueData.data_buf, index, blueData.data_elem));
+    }
+
+    // Device helper for block-CC FWD_CHIRP loads. With N = length_N_blue and
+    // M = length_M_blue, the generated body returns
+    //   chirp[transform_idx]      when transform_idx < N,
+    //   chirp[M - transform_idx]  when transform_idx >= M - N + 1,
+    //   scalar_type(0, 0)         otherwise.
+    // The intrinsic flavour also takes data_rw_flag and and-s it into both non-zero conditions.
+    Function generate_fwd_chirp_load_cc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_LOAD_CC_FWD_CHIRP)
+                             : function.get_op_name(BFN_LOAD_CC_FWD_CHIRP)};
+        f.templates   = get_template_list();
+        f.return_type = "scalar_type";
+        f.qualifier   = "__device__";
+
+        ArgumentList params;
+        params.append(blueData.chirp);
+        params.append(blueData.transform_idx);
+        if(intrinsic)
+            params.append(blueData.data_rw_flag);
+        params.append(blueData.length_N_blue);
+        params.append(blueData.length_M_blue);
+        f.arguments = params;
+
+        // Pieces shared by both flavours; auto keeps each node's own type.
+        auto in_head   = blueData.transform_idx < blueData.length_N_blue;
+        auto in_tail   = blueData.transform_idx
+                       >= blueData.length_M_blue - blueData.length_N_blue + 1;
+        auto head_elem = blueData.chirp[blueData.transform_idx];
+        auto tail_elem = blueData.chirp[blueData.length_M_blue - blueData.transform_idx];
+
+        if(intrinsic)
+        {
+            f.body += If{in_head && blueData.data_rw_flag,
+                         {
+                             ReturnExpr(head_elem),
+                         }};
+            f.body += ElseIf{in_tail && blueData.data_rw_flag,
+                             {
+                                 ReturnExpr(tail_elem),
+                             }};
+        }
+        else
+        {
+            f.body += If{in_head, {ReturnExpr(head_elem)}};
+            f.body += ElseIf{in_tail, {ReturnExpr(tail_elem)}};
+        }
+        f.body += Else{{
+            ReturnExpr(CallExpr{"scalar_type", {0, 0}}),
+        }};
+
+        return f;
+    }
+
+    // Device helper for block-RC FWD_CHIRP loads: callback load declaration plus a returned load.
+    Function generate_fwd_chirp_load_rc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_LOAD_RC_FWD_CHIRP)
+                             : function.get_op_name(BFN_LOAD_RC_FWD_CHIRP)};
+        f.templates   = get_template_list();
+        f.return_type = "scalar_type";
+        f.qualifier   = "__device__";
+
+        // signature: addressing args, data buffer(s), then the load callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_load);
+        params.append(blueData.load_cb_fn);
+        params.append(blueData.load_cb_data);
+        f.arguments = params;
+
+        const auto loaded = get_load_expression();
+        f.body += CallbackLoadDeclaration{blueData.scalar_type.render(),
+                                          blueData.callback_type.render()};
+        f.body += ReturnExpr{*loaded};
+
+        return f;
+    }
+
+    // Device helper for block-CC FWD_CHIRP_MUL loads: returns scalar_type(0, 0) when
+    // transform_idx >= length_N_blue, else the loaded element combined with chirp[transform_idx].
+    // Throws std::runtime_error if direction is neither -1 nor +1 (was a null dereference).
+    Function generate_fwd_chirp_mul_load_cc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_LOAD_CC_FWD_CHIRP_MUL)
+                             : function.get_op_name(BFN_LOAD_CC_FWD_CHIRP_MUL)};
+
+        f.templates = get_template_list();
+        ArgumentList args;
+        args.append(blueData.chirp);
+        args.append(blueData.transform_idx);
+        append_data_index(args, intrinsic);
+        args.append(blueData.length_N_blue);
+        append_data_buf(args, planar_load);
+        args.append(blueData.load_cb_fn);
+        args.append(blueData.load_cb_data);
+        f.arguments   = args;
+        f.return_type = "scalar_type";
+        f.qualifier   = "__device__";
+
+        Variable elem_scalar{"elem_scalar", "scalar_type"};
+        Variable aux_real{"aux_real", "real_type_t<scalar_type>"};
+        auto     chirp_val       = blueData.chirp[blueData.transform_idx];
+        auto     load_expression = get_load_expression();
+
+        std::unique_ptr<Expression> mul_assign_expression_x, mul_assign_expression_y;
+        if(direction == -1) // forward
+        {
+            mul_assign_expression_x = std::make_unique<Expression>(
+                elem_scalar.x() * chirp_val.x() + elem_scalar.y() * chirp_val.y());
+            mul_assign_expression_y = std::make_unique<Expression>(
+                -aux_real * chirp_val.y() + elem_scalar.y() * chirp_val.x());
+        }
+        else if(direction == +1) // inverse
+        {
+            mul_assign_expression_x = std::make_unique<Expression>(
+                elem_scalar.x() * chirp_val.x() - elem_scalar.y() * chirp_val.y());
+            mul_assign_expression_y = std::make_unique<Expression>(
+                -aux_real * chirp_val.y() - elem_scalar.y() * chirp_val.x());
+        }
+        else // any other value would leave both pointers null and be dereferenced below
+            throw std::runtime_error("unsupported bluestein direction");
+
+        f.body += CallbackLoadDeclaration{blueData.scalar_type.render(),
+                                          blueData.callback_type.render()};
+        f.body += If{blueData.transform_idx >= blueData.length_N_blue,
+                     {
+                         Call{"return", {CallExpr{"scalar_type", {0, 0}}}},
+                     }};
+        f.body += Else{{
+            Declaration{elem_scalar},
+            Declaration{aux_real},
+            Assign{elem_scalar, *load_expression},
+            Assign{aux_real, elem_scalar.x()}, // keep the loaded x; it is overwritten next
+            Assign{elem_scalar.x(), *mul_assign_expression_x},
+            Assign{elem_scalar.y(), *mul_assign_expression_y},
+            ReturnExpr(elem_scalar),
+        }};
+
+        return f;
+    }
+
+    // Device helper for block-RC FWD_CHIRP_MUL loads: callback load declaration plus a returned load.
+    Function generate_fwd_chirp_mul_load_rc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_LOAD_RC_FWD_CHIRP_MUL)
+                             : function.get_op_name(BFN_LOAD_RC_FWD_CHIRP_MUL)};
+        f.templates   = get_template_list();
+        f.return_type = "scalar_type";
+        f.qualifier   = "__device__";
+
+        // signature: addressing args, data buffer(s), then the load callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_load);
+        params.append(blueData.load_cb_fn);
+        params.append(blueData.load_cb_data);
+        f.arguments = params;
+
+        const auto loaded = get_load_expression();
+        f.body += CallbackLoadDeclaration{blueData.scalar_type.render(),
+                                          blueData.callback_type.render()};
+        f.body += ReturnExpr(*loaded);
+
+        return f;
+    }
+
+    // Device helper for block-CC INV_CHIRP_MUL loads. The generated body loads
+    // elem_scalar at transform_idx and aux_scalar at (data index + length_M_blue),
+    // then returns their complex product:
+    //   (e.x * a.x - e.y * a.y, e.x * a.y + e.y * a.x).
+    Function generate_inv_chirp_mul_load_cc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_LOAD_CC_INV_CHIRP_MUL)
+                             : function.get_op_name(BFN_LOAD_CC_INV_CHIRP_MUL)};
+        f.templates   = get_template_list();
+        f.return_type = "scalar_type";
+        f.qualifier   = "__device__";
+
+        ArgumentList params;
+        params.append(blueData.transform_idx);
+        append_data_index(params, intrinsic);
+        params.append(blueData.length_M_blue);
+        append_data_buf(params, planar_load);
+        params.append(blueData.load_cb_fn);
+        params.append(blueData.load_cb_data);
+        f.arguments = params;
+
+        Variable elem{"elem_scalar", "scalar_type"};
+        Variable aux{"aux_scalar", "scalar_type"};
+        Variable saved_re{"aux_real", "real_type_t<scalar_type>"};
+
+        // intrinsic addressing folds voffset + soffset into one index before adding M
+        const auto first_operand = get_load_expression(blueData.transform_idx);
+        const auto second_operand
+            = intrinsic ? get_load_expression(blueData.data_voffset + blueData.data_soffset
+                                              + blueData.length_M_blue)
+                        : get_load_expression(blueData.data_idx + blueData.length_M_blue);
+
+        f.body += CallbackLoadDeclaration{blueData.scalar_type.render(),
+                                          blueData.callback_type.render()};
+        f.body += Declaration{elem};
+        f.body += Declaration{aux};
+        f.body += Declaration{saved_re};
+        f.body += Assign{elem, *first_operand};
+        f.body += Assign{aux, *second_operand};
+        // elem.x is overwritten next, so its loaded value is parked in aux_real first
+        f.body += Assign{saved_re, elem.x()};
+        f.body += Assign{elem.x(), elem.x() * aux.x() - elem.y() * aux.y()};
+        f.body += Assign{elem.y(), saved_re * aux.y() + elem.y() * aux.x()};
+
+        f.body += ReturnExpr(elem);
+
+        return f;
+    }
+
+    // Device helper for block-RC INV_CHIRP_MUL loads: callback load declaration plus a returned load.
+    Function generate_inv_chirp_mul_load_rc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_LOAD_RC_INV_CHIRP_MUL)
+                             : function.get_op_name(BFN_LOAD_RC_INV_CHIRP_MUL)};
+        f.templates   = get_template_list();
+        f.return_type = "scalar_type";
+        f.qualifier   = "__device__";
+
+        // signature: addressing args, data buffer(s), then the load callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_load);
+        params.append(blueData.load_cb_fn);
+        params.append(blueData.load_cb_data);
+        f.arguments = params;
+
+        const auto loaded = get_load_expression();
+        f.body += CallbackLoadDeclaration{blueData.scalar_type.render(),
+                                          blueData.callback_type.render()};
+        f.body += ReturnExpr(*loaded);
+
+        return f;
+    }
+
+    // Device helper for block-CC FWD_CHIRP stores: callback store declaration plus one store.
+    Function generate_fwd_chirp_store_cc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_STORE_CC_FWD_CHIRP)
+                             : function.get_op_name(BFN_STORE_CC_FWD_CHIRP)};
+        f.templates = get_template_list();
+        f.qualifier = "__device__";
+
+        // signature: addressing args, data buffer(s), element to store, store callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_store);
+        params.append(blueData.data_elem);
+        params.append(blueData.store_cb_fn);
+        params.append(blueData.store_cb_data);
+        f.arguments = params;
+
+        const auto store_stmt = get_store_statement();
+        f.body += CallbackStoreDeclaration{blueData.scalar_type.render(),
+                                           blueData.callback_type.render()};
+        f.body += *store_stmt;
+
+        return f;
+    }
+
+    // Device helper for block-RC FWD_CHIRP stores: callback store declaration plus one store.
+    Function generate_fwd_chirp_store_rc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_STORE_RC_FWD_CHIRP)
+                             : function.get_op_name(BFN_STORE_RC_FWD_CHIRP)};
+        f.templates = get_template_list();
+        f.qualifier = "__device__";
+
+        // signature: addressing args, data buffer(s), element to store, store callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_store);
+        params.append(blueData.data_elem);
+        params.append(blueData.store_cb_fn);
+        params.append(blueData.store_cb_data);
+        f.arguments = params;
+
+        const auto store_stmt = get_store_statement();
+        f.body += CallbackStoreDeclaration{blueData.scalar_type.render(),
+                                           blueData.callback_type.render()};
+        f.body += *store_stmt;
+
+        return f;
+    }
+
+    // Device helper for block-CC FWD_CHIRP_MUL stores: callback store declaration plus one store.
+    Function generate_fwd_chirp_mul_store_cc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_STORE_CC_FWD_CHIRP_MUL)
+                             : function.get_op_name(BFN_STORE_CC_FWD_CHIRP_MUL)};
+        f.templates = get_template_list();
+        f.qualifier = "__device__";
+
+        // signature: addressing args, data buffer(s), element to store, store callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_store);
+        params.append(blueData.data_elem);
+        params.append(blueData.store_cb_fn);
+        params.append(blueData.store_cb_data);
+        f.arguments = params;
+
+        const auto store_stmt = get_store_statement();
+        f.body += CallbackStoreDeclaration{blueData.scalar_type.render(),
+                                           blueData.callback_type.render()};
+        f.body += *store_stmt;
+
+        return f;
+    }
+
+    // Device helper for block-RC FWD_CHIRP_MUL stores: stores data_elem at
+    // (data index + length_M_blue) after the callback store declaration.
+    Function generate_fwd_chirp_mul_store_rc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_STORE_RC_FWD_CHIRP_MUL)
+                             : function.get_op_name(BFN_STORE_RC_FWD_CHIRP_MUL)};
+        f.templates = get_template_list();
+        f.qualifier = "__device__";
+
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        params.append(blueData.length_M_blue);
+        append_data_buf(params, planar_store);
+        params.append(blueData.data_elem);
+        params.append(blueData.store_cb_fn);
+        params.append(blueData.store_cb_data);
+        f.arguments = params;
+
+        // intrinsic addressing folds voffset + soffset into one index before adding M
+        const auto store_stmt
+            = intrinsic ? get_store_statement(blueData.data_voffset + blueData.data_soffset
+                                              + blueData.length_M_blue)
+                        : get_store_statement(blueData.data_idx + blueData.length_M_blue);
+
+        f.body += CallbackStoreDeclaration{blueData.scalar_type.render(),
+                                           blueData.callback_type.render()};
+        f.body += *store_stmt;
+
+        return f;
+    }
+
+    // Device helper for block-CC INV_CHIRP_MUL stores: callback store declaration plus one store.
+    Function generate_inv_chirp_mul_store_cc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_STORE_CC_INV_CHIRP_MUL)
+                             : function.get_op_name(BFN_STORE_CC_INV_CHIRP_MUL)};
+        f.templates = get_template_list();
+        f.qualifier = "__device__";
+
+        // signature: addressing args, data buffer(s), element to store, store callback pair
+        ArgumentList params;
+        append_data_index(params, intrinsic);
+        append_data_buf(params, planar_store);
+        params.append(blueData.data_elem);
+        params.append(blueData.store_cb_fn);
+        params.append(blueData.store_cb_data);
+        f.arguments = params;
+
+        const auto store_stmt = get_store_statement();
+        f.body += CallbackStoreDeclaration{blueData.scalar_type.render(),
+                                           blueData.callback_type.render()};
+        f.body += *store_stmt;
+
+        return f;
+    }
+
+    // Device helper for block-RC INV_CHIRP_MUL stores. Only when transform_idx < length_N_blue
+    // the body scales data_elem by 1 / length_M_blue, combines it with chirp[transform_idx], stores it.
+    // Throws std::runtime_error if direction is neither +1 nor -1 (was a null dereference).
+    Function generate_inv_chirp_mul_store_rc()
+    {
+        Function f{intrinsic ? function.get_intrinsic_op_name(BFN_STORE_RC_INV_CHIRP_MUL)
+                             : function.get_op_name(BFN_STORE_RC_INV_CHIRP_MUL)};
+        f.templates = get_template_list();
+        ArgumentList args;
+        args.append(blueData.chirp);
+        args.append(blueData.transform_idx);
+        append_data_index(args, intrinsic);
+        args.append(blueData.length_N_blue);
+        args.append(blueData.length_M_blue);
+        append_data_buf(args, planar_store);
+        args.append(blueData.data_elem);
+        args.append(blueData.store_cb_fn);
+        args.append(blueData.store_cb_data);
+        f.arguments = args;
+        f.qualifier = "__device__";
+
+        Variable aux_real{"aux_real", "real_type_t<scalar_type>"};
+        auto     chirp_val       = blueData.chirp[blueData.transform_idx];
+        auto     store_statement = get_store_statement();
+
+        std::unique_ptr<Expression> mul_assign_expression;
+        if(direction == +1) // inverse
+            mul_assign_expression = std::make_unique<Expression>(
+                -aux_real * chirp_val.y() + blueData.data_elem.y() * chirp_val.x());
+        else if(direction == -1) // forward
+            mul_assign_expression = std::make_unique<Expression>(
+                aux_real * chirp_val.y() - blueData.data_elem.y() * chirp_val.x());
+        else // any other value would leave the pointer null and be dereferenced below
+            throw std::runtime_error("unsupported bluestein direction");
+
+        f.body += CallbackStoreDeclaration{blueData.scalar_type.render(),
+                                           blueData.callback_type.render()};
+        f.body += If{
+            blueData.transform_idx < blueData.length_N_blue,
+            {
+                Assign{blueData.data_elem,
+                       blueData.data_elem
+                           * Parens{Literal{"1.0 / (real_type_t<scalar_type>) "
+                                            + blueData.length_M_blue.render()}}},
+                Declaration{aux_real},
+                Assign{aux_real, blueData.data_elem.x()}, // keep the scaled x; it is overwritten next
+                Assign{blueData.data_elem.x(),
+                       blueData.data_elem.x() * chirp_val.x() + blueData.data_elem.y() * chirp_val.y()},
+                Assign{blueData.data_elem.y(), *mul_assign_expression},
+                *store_statement,
+            }};
+
+        return f;
+    }
+
+    ComputeScheme     scheme; // kernel scheme; dispatch handles only the BLOCK_CC / BLOCK_RC values
+    BluesteinFuseType type; // fused operation selector (BFT_*)
+    BluesteinData     blueData; // default-constructed; supplies the variables referenced by generated helpers
+    int               direction; // -1 forward, +1 inverse
+    bool              planar_load; // loads use the data_bufre / data_bufim pair
+    bool              planar_store; // stores use the data_bufre / data_bufim pair
+    bool              intrinsic; // build Intrinsic* load/store nodes and use intrinsic op names
+    bool              enable_scaling; // planar stores receive scale_factor when true
+    Expression        scale_factor; // only consulted when enable_scaling is true
+    BluesteinFunction function; // maps BFN_* operations to helper names
+};
+
+static Function generate_bluestein_device_load_function(const ComputeScheme     scheme,
+                                                        const BluesteinFuseType type,
+                                                        int                     direction,
+                                                        bool                    planar,
+                                                        bool                    intrinsic)
+{
+    BluesteinKernel kernel(scheme, type, direction, planar, /*planar_store=*/false, intrinsic);
+    return kernel.generate_device_load_function(); // store-side planar flag is unused here
+}
+
+// Builds a BluesteinKernel and returns its device store function. `false` fills the 4th
+// constructor slot and `planar` the 5th (presumably planar_load / planar_store -- TODO confirm).
+static Function generate_bluestein_device_store_function(
+    const ComputeScheme scheme, const BluesteinFuseType type, int direction,
+    bool planar, bool intrinsic)
+{
+    BluesteinKernel kernel(scheme, type, direction, false, planar, intrinsic);
+    return kernel.generate_device_store_function();
+}
+
+// Common base of the Bluestein rewriting visitors; widens a function's argument list.
+struct MakeBluesteinVisitor : public BaseVisitor
+{
+    MakeBluesteinVisitor()
+        : BaseVisitor()
+    {
+    }
+
+    // Copies `x`, appends the Bluestein lengths, strides and distances, then defers to BaseVisitor.
+    Function visit_Function(const Function& x) override
+    {
+        Function extended{x};
+        extended.arguments.append(blueData.length_N_blue);
+        extended.arguments.append(blueData.length_M_blue);
+        extended.arguments.append(blueData.global_stride_in_0);
+        extended.arguments.append(blueData.global_stride_in_1);
+        extended.arguments.append(blueData.global_idist);
+        extended.arguments.append(blueData.global_stride_out_0);
+        extended.arguments.append(blueData.global_stride_out_1);
+        extended.arguments.append(blueData.global_odist);
+        return BaseVisitor::visit_Function(extended);
+    }
+
+    BluesteinData          blueData; // supplies the variables appended above
+    BluesteinFunction      blueFunction; // used by derived visitors for load/store rewrites
+    BluesteinOperationType load_op; // not set here; assigned by the concrete visitors
+    BluesteinOperationType store_op; // not set here; assigned by the concrete visitors
+};
+
+// Visitor used for CS_KERNEL_STOCKHAM_BLOCK_CC kernels: each global / intrinsic load or
+// store node is replaced by what blueFunction yields for load_op / store_op. The last
+// argument passed to blueFunction is true for the planar node types, false otherwise.
+struct MakeBluesteinCCVisitor : public MakeBluesteinVisitor
+{
+    MakeBluesteinCCVisitor() {}
+
+    Expression visit_LoadGlobal(const LoadGlobal& x) override
+    {
+        return blueFunction.get_load_op(load_op, x, false);
+    }
+
+    Expression visit_LoadGlobalPlanar(const LoadGlobalPlanar& x) override
+    {
+        return blueFunction.get_load_op(load_op, x, true);
+    }
+
+    Expression visit_IntrinsicLoad(const IntrinsicLoad& x) override
+    {
+        return blueFunction.get_load_op(load_op, x, false);
+    }
+
+    Expression visit_IntrinsicLoadPlanar(const IntrinsicLoadPlanar& x) override
+    {
+        return blueFunction.get_load_op(load_op, x, true);
+    }
+
+    StatementList visit_StoreGlobal(const StoreGlobal& x) override
+    {
+        StatementList replaced{};
+        replaced += blueFunction.get_store_op(store_op, x, false);
+        return replaced;
+    }
+
+    StatementList visit_StoreGlobalPlanar(const StoreGlobalPlanar& x) override
+    {
+        StatementList replaced{};
+        replaced += blueFunction.get_store_op(store_op, x, true);
+        return replaced;
+    }
+
+    StatementList visit_IntrinsicStore(const IntrinsicStore& x) override
+    {
+        StatementList replaced{};
+        replaced += blueFunction.get_store_op(store_op, x, false);
+        return replaced;
+    }
+
+    StatementList visit_IntrinsicStorePlanar(const IntrinsicStorePlanar& x) override
+    {
+        StatementList replaced{};
+        replaced += blueFunction.get_store_op(store_op, x, true);
+        return replaced;
+    }
+};
+
+// Visitor used for CS_KERNEL_STOCKHAM_BLOCK_RC kernels: only the non-intrinsic global
+// load / store nodes are overridden here. The last argument passed to blueFunction is
+// true for the planar node types, false otherwise.
+struct MakeBluesteinRCVisitor : public MakeBluesteinVisitor
+{
+    MakeBluesteinRCVisitor() {}
+
+    Function visit_Function(const Function& x) override
+    {
+        // Nothing RC-specific is added; the base visitor makes its own copy of `x`
+        // before appending the common Bluestein arguments.
+        return MakeBluesteinVisitor::visit_Function(x);
+    }
+
+    Expression visit_LoadGlobal(const LoadGlobal& x) override
+    {
+        return blueFunction.get_load_op(load_op, x, false);
+    }
+
+    Expression visit_LoadGlobalPlanar(const LoadGlobalPlanar& x) override
+    {
+        return blueFunction.get_load_op(load_op, x, true);
+    }
+
+    StatementList visit_StoreGlobal(const StoreGlobal& x) override
+    {
+        StatementList replaced{};
+        replaced += blueFunction.get_store_op(store_op, x, false);
+        return replaced;
+    }
+
+    StatementList visit_StoreGlobalPlanar(const StoreGlobalPlanar& x) override
+    {
+        StatementList replaced{};
+        replaced += blueFunction.get_store_op(store_op, x, true);
+        return replaced;
+    }
+};
+
+// CC visitor preset with the FWD_CHIRP load/store operations; also adds the chirp argument.
+struct MakeBluesteinFwdChirpCCVisitor : public MakeBluesteinCCVisitor
+{
+    MakeBluesteinFwdChirpCCVisitor()
+    {
+        load_op  = BFN_LOAD_CC_FWD_CHIRP;
+        store_op = BFN_STORE_CC_FWD_CHIRP;
+    }
+
+    Function visit_Function(const Function& x) override
+    {
+        Function with_chirp{x};
+        with_chirp.arguments.append(blueData.chirp);
+        // chirp is appended first; the base classes then add the common arguments
+        return MakeBluesteinCCVisitor::visit_Function(with_chirp);
+    }
+};
+
+Function make_bluestein_fwd_chirp_cc(const Function& f)
+{
+    MakeBluesteinFwdChirpCCVisitor rewriter{}; // ctor selects the CC FWD_CHIRP ops
+    return rewriter(f);
+}
+
+// RC visitor preset with the FWD_CHIRP load/store operations (no extra arguments).
+struct MakeBluesteinFwdChirpRCVisitor : public MakeBluesteinRCVisitor
+{
+    MakeBluesteinFwdChirpRCVisitor()
+    {
+        load_op  = BFN_LOAD_RC_FWD_CHIRP;
+        store_op = BFN_STORE_RC_FWD_CHIRP;
+    }
+};
+
+Function make_bluestein_fwd_chirp_rc(const Function& f)
+{
+    MakeBluesteinFwdChirpRCVisitor rewriter{}; // ctor selects the RC FWD_CHIRP ops
+    return rewriter(f);
+}
+
+// CC visitor preset with the FWD_CHIRP_MUL load/store operations; also adds the chirp argument.
+struct MakeBluesteinFwdChirpMulCCVisitor : public MakeBluesteinCCVisitor
+{
+    MakeBluesteinFwdChirpMulCCVisitor()
+    {
+        load_op  = BFN_LOAD_CC_FWD_CHIRP_MUL;
+        store_op = BFN_STORE_CC_FWD_CHIRP_MUL;
+    }
+
+    Function visit_Function(const Function& x) override
+    {
+        Function with_chirp{x};
+        with_chirp.arguments.append(blueData.chirp);
+        // chirp is appended first; the base classes then add the common arguments
+        return MakeBluesteinCCVisitor::visit_Function(with_chirp);
+    }
+};
+
+Function make_bluestein_fwd_chirp_mul_cc(const Function& f)
+{
+    MakeBluesteinFwdChirpMulCCVisitor rewriter{}; // ctor selects the CC FWD_CHIRP_MUL ops
+    return rewriter(f);
+}
+
+// RC visitor preset with the FWD_CHIRP_MUL load/store operations (no extra arguments).
+struct MakeBluesteinFwdChirpMulRCVisitor : public MakeBluesteinRCVisitor
+{
+    MakeBluesteinFwdChirpMulRCVisitor()
+    {
+        load_op  = BFN_LOAD_RC_FWD_CHIRP_MUL;
+        store_op = BFN_STORE_RC_FWD_CHIRP_MUL;
+    }
+};
+
+Function make_bluestein_fwd_chirp_mul_rc(const Function& f)
+{
+    MakeBluesteinFwdChirpMulRCVisitor rewriter{}; // ctor selects the RC FWD_CHIRP_MUL ops
+    return rewriter(f);
+}
+
+// CC visitor preset with the INV_CHIRP_MUL load/store operations (no extra arguments).
+struct MakeBluesteinInvChirpMulCCVisitor : public MakeBluesteinCCVisitor
+{
+    MakeBluesteinInvChirpMulCCVisitor()
+    {
+        load_op  = BFN_LOAD_CC_INV_CHIRP_MUL;
+        store_op = BFN_STORE_CC_INV_CHIRP_MUL;
+    }
+};
+
+Function make_bluestein_inv_chirp_mul_cc(const Function& f)
+{
+    MakeBluesteinInvChirpMulCCVisitor rewriter{}; // ctor selects the CC INV_CHIRP_MUL ops
+    return rewriter(f);
+}
+
+// RC visitor preset with the INV_CHIRP_MUL load/store operations; also adds the chirp argument.
+struct MakeBluesteinInvChirpMulRCVisitor : public MakeBluesteinRCVisitor
+{
+    MakeBluesteinInvChirpMulRCVisitor()
+    {
+        load_op  = BFN_LOAD_RC_INV_CHIRP_MUL;
+        store_op = BFN_STORE_RC_INV_CHIRP_MUL;
+    }
+
+    Function visit_Function(const Function& x) override
+    {
+        Function with_chirp{x};
+        with_chirp.arguments.append(blueData.chirp);
+        // chirp is appended first; the base classes then add the common arguments
+        return MakeBluesteinRCVisitor::visit_Function(with_chirp);
+    }
+};
+
+Function make_bluestein_inv_chirp_mul_rc(const Function& f)
+{
+    MakeBluesteinInvChirpMulRCVisitor rewriter{}; // ctor selects the RC INV_CHIRP_MUL ops
+    return rewriter(f);
+}
+
+// Routes `f` to the Bluestein rewrite matching (type, scheme). Only the block CC and
+// block RC schemes are handled; every other combination, including BFT_NONE, throws.
+static Function
+    make_bluestein(const ComputeScheme scheme, const BluesteinFuseType type, const Function& f)
+{
+    const bool is_block_cc = scheme == CS_KERNEL_STOCKHAM_BLOCK_CC;
+    const bool is_block_rc = scheme == CS_KERNEL_STOCKHAM_BLOCK_RC;
+
+    switch(type)
+    {
+    case BFT_NONE:
+        break;
+    case BFT_FWD_CHIRP:
+        if(is_block_cc)
+            return make_bluestein_fwd_chirp_cc(f);
+        if(is_block_rc)
+            return make_bluestein_fwd_chirp_rc(f);
+        break;
+    case BFT_FWD_CHIRP_MUL:
+        if(is_block_cc)
+            return make_bluestein_fwd_chirp_mul_cc(f);
+        if(is_block_rc)
+            return make_bluestein_fwd_chirp_mul_rc(f);
+        break;
+    case BFT_INV_CHIRP_MUL:
+        if(is_block_cc)
+            return make_bluestein_inv_chirp_mul_cc(f);
+        if(is_block_rc)
+            return make_bluestein_inv_chirp_mul_rc(f);
+        break;
+    }
+
+    // Reached for BFT_NONE and for any scheme other than the two tested above.
+    throw std::runtime_error("unsupported bluestein fuse scheme");
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/device/generator/fftgenerator.h rocfft-5.7.1/library/src/device/generator/fftgenerator.h
--- rocfft-5.5.0/library/src/device/generator/fftgenerator.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/fftgenerator.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -543,7 +543,7 @@
             // auto tidx = params.nheight - 1 + w - 1 + (params.width - 1) * (tid % params.nheight);
             auto ridx  = h * params.width + w;
             auto theta = Literal(params.nheight); // XXX
-            stmts += Call("sincospi", {theta, t.y, t.x}); // XXX need address of
+            stmts += Call("sincospi", {theta, t.y(), t.x()}); // XXX need address of
             stmts += Assign(t, TwiddleMultiply(t, R[ridx]));
             stmts += Assign(R[ridx], t);
         }
@@ -675,9 +675,9 @@
             if(component == Component::BOTH)
                 stmts += Assign(dst[idx], src[h * params.width + w]);
             else if(component == Component::REAL)
-                stmts += Assign(dst[idx], src[h * params.width + w].x);
+                stmts += Assign(dst[idx], src[h * params.width + w].x());
             else if(component == Component::IMAG)
-                stmts += Assign(dst[idx], src[h * params.width + w].y);
+                stmts += Assign(dst[idx], src[h * params.width + w].y());
         }
 
         return stmts;
@@ -714,9 +714,9 @@
             if(component == Component::BOTH)
                 stmts += Assign(dst[h * params.width + w], src[idx]);
             else if(component == Component::REAL)
-                stmts += Assign(dst[h * params.width + w].x, src[idx]);
+                stmts += Assign(dst[h * params.width + w].x(), src[idx]);
             else if(component == Component::IMAG)
-                stmts += Assign(dst[h * params.width + w].y, src[idx]);
+                stmts += Assign(dst[h * params.width + w].y(), src[idx]);
         }
 
         return stmts;
@@ -752,9 +752,9 @@
             if(component == Component::BOTH)
                 stmts += Assign(dst[h * params.width + w], src[idx]);
             else if(component == Component::REAL)
-                stmts += Assign(dst[h * params.width + w].x, src[idx]);
+                stmts += Assign(dst[h * params.width + w].x(), src[idx]);
             else if(component == Component::IMAG)
-                stmts += Assign(dst[h * params.width + w].y, src[idx]);
+                stmts += Assign(dst[h * params.width + w].y(), src[idx]);
         }
 
         return stmts;
@@ -891,16 +891,17 @@
             auto tid = params.thread + h * params.threads_per_transform;
             auto idx = tid + w * (params.length / params.width);
 
-            stmts
-                += If{idx < length,
-                      {
-                          Declaration{in_elem},
-                          Assign{in_elem, src.load_global(idx)},
-                          Assign{dst[idx].x, in_elem.x * chirp[idx].x + in_elem.y * chirp[idx].y},
-                          Assign{dst[idx].y, -in_elem.x * chirp[idx].y + in_elem.y * chirp[idx].x},
-                      }};
+            stmts += If{idx < length,
+                        {
+                            Declaration{in_elem},
+                            Assign{in_elem, src.load_global(idx)},
+                            Assign{dst[idx].x(),
+                                   in_elem.x() * chirp[idx].x() + in_elem.y() * chirp[idx].y()},
+                            Assign{dst[idx].y(),
+                                   -in_elem.x() * chirp[idx].y() + in_elem.y() * chirp[idx].x()},
+                        }};
             stmts += Else{{
-                Assign{dst[idx], CallExpr{"lib_make_vector2<scalar_type>", {0, 0}}},
+                Assign{dst[idx], CallExpr{"scalar_type", {0, 0}}},
             }};
         }
 
@@ -945,8 +946,8 @@
             auto idx = tid + w * (params.length / params.width);
 
             stmts += Assign{elem, srcA[idx]};
-            stmts += Assign{dst[idx].x, srcB[idx].x * elem.x - srcB[idx].y * elem.y};
-            stmts += Assign{dst[idx].y, srcB[idx].x * elem.y + srcB[idx].y * elem.x};
+            stmts += Assign{dst[idx].x(), srcB[idx].x() * elem.x() - srcB[idx].y() * elem.y()};
+            stmts += Assign{dst[idx].y(), srcB[idx].x() * elem.y() + srcB[idx].y() * elem.x()};
         }
         return stmts;
     }
@@ -1000,10 +1001,10 @@
             auto idx = tid + w * (params.length / params.width);
 
             If write_cond{idx < length, {}};
-            write_cond.body
-                += Assign{elem.x, MI * (src[idx].x * chirp[idx].x + src[idx].y * chirp[idx].y)};
-            write_cond.body
-                += Assign{elem.y, MI * (-src[idx].x * chirp[idx].y + src[idx].y * chirp[idx].x)};
+            write_cond.body += Assign{
+                elem.x(), MI * (src[idx].x() * chirp[idx].x() + src[idx].y() * chirp[idx].y())};
+            write_cond.body += Assign{
+                elem.y(), MI * (-src[idx].x() * chirp[idx].y() + src[idx].y() * chirp[idx].x())};
             if(enable_scaling)
                 write_cond.body += MultiplyAssign(elem, scale_factor);
             write_cond.body += dst.store_global(idx, elem);
diff -Nru rocfft-5.5.0/library/src/device/generator/generator.cpp rocfft-5.7.1/library/src/device/generator/generator.cpp
--- rocfft-5.5.0/library/src/device/generator/generator.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/generator.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -178,9 +178,15 @@
     return "load_cb(" + vrender(args[0]) + "," + vrender(args[1]) + ", load_cb_data, nullptr)";
 }
 
-std::string ScalarVariable::render() const
+LoadGlobalPlanar::LoadGlobalPlanar(const std::vector<Expression>& args)
+    : args(args)
 {
-    return name;
+}
+
+std::string LoadGlobalPlanar::render() const
+{
+    const std::string subscript = "[" + vrender(args[2]) + "]"; // same index for both arrays
+    return "{" + vrender(args[0]) + subscript + "," + vrender(args[1]) + subscript + "}";
 }
 
 std::string ArgumentList::render_decl() const
@@ -214,9 +220,6 @@
     , type(_type)
     , pointer(pointer)
     , restrict(restrict)
-    , x(_name + ".x", _type, Component::REAL)
-    , y(_name + ".y", _type, Component::IMAG)
-    , component(Component::BOTH)
 {
     if(size > 0)
         this->size = Expression{size};
@@ -231,9 +234,6 @@
     , type(_type)
     , pointer(pointer)
     , restrict(restrict)
-    , x(_name + ".x", _type, Component::REAL)
-    , y(_name + ".y", _type, Component::IMAG)
-    , component(Component::BOTH)
     , size(_size)
 {
 }
@@ -247,8 +247,6 @@
     , type(v.type)
     , pointer(v.pointer)
     , restrict(v.restrict)
-    , x(v.x)
-    , y(v.y)
     , component(v.component)
     , index(v.index)
     , index2D(v.index2D)
@@ -256,11 +254,6 @@
     , size2D(v.size2D)
     , decl_default(v.decl_default)
 {
-    if(index)
-    {
-        x.name = v.name + "[" + vrender(*index) + "].x";
-        y.name = v.name + "[" + vrender(*index) + "].y";
-    }
 }
 
 Variable::Variable(const Variable& v, const Expression& _index)
@@ -268,16 +261,12 @@
     , type(v.type)
     , pointer(v.pointer)
     , restrict(v.restrict)
-    , x(v.x)
-    , y(v.y)
     , component(v.component)
     , index(_index)
 {
     size         = v.size;
     size2D       = v.size2D;
     decl_default = v.decl_default;
-    x.name       = v.name + "[" + vrender(*index) + "].x";
-    y.name       = v.name + "[" + vrender(*index) + "].y";
 }
 
 Variable::Variable(const Variable& v, const Expression& _index, const Expression& _index2D)
@@ -286,25 +275,46 @@
     index2D = _index2D;
 }
 
-ScalarVariable Variable::address() const
+Variable Variable::address() const
 {
     if(index)
     {
-        return ScalarVariable("&" + name + "[" + vrender(*index) + "]", type + "*");
+        return Variable("&" + name + "[" + vrender(*index) + "]", type + "*");
     }
-    return ScalarVariable("&" + name, type + "*");
+    return Variable("&" + name, type + "*");
+}
+
+Variable Variable::x() const // copy tagged Component::REAL; render() appends ".x" for it
+{
+    Variable real_part(*this);
+    real_part.component = Component::REAL;
+    return real_part;
+}
+
+Variable Variable::y() const // copy tagged Component::IMAG; render() appends ".y" for it
+{
+    Variable imag_part(*this);
+    imag_part.component = Component::IMAG;
+    return imag_part;
 }
 
 std::string Variable::render() const
 {
+    std::string output;
     if(index)
     {
-        std::string output = name + "[" + vrender(*index) + "]";
+        output = name + "[" + vrender(*index) + "]";
         if(index2D)
             output += "[" + vrender(*index2D) + "]";
-        return output;
     }
-    return name;
+    else
+        output = name;
+
+    if(component == Component::REAL)
+        output += ".x";
+    else if(component == Component::IMAG)
+        output += ".y";
+    return output;
 }
 
 Variable Variable::operator[](const Expression& index) const
@@ -317,23 +327,48 @@
     return Variable(*this, index, index2D);
 }
 
+OptionalExpression::OptionalExpression() = default; // starts empty: expr is a null unique_ptr
+
+// Move construction: ownership of the held Expression (if any) passes to the new
+// object. The only member is the unique_ptr, so the member-wise default move
+// performs exactly that transfer.
+OptionalExpression::OptionalExpression(OptionalExpression&&) = default;
+// Deep copy: clones the Expression held by `o`, or stays empty when `o` is empty.
+OptionalExpression::OptionalExpression(const OptionalExpression& o)
+    : expr(o.expr ? std::make_unique<Expression>(*o.expr) : nullptr)
+{
+}
+
+OptionalExpression& OptionalExpression::operator=(OptionalExpression&& o)
+{
+    expr.swap(o.expr); // exchange the owned pointers; `o` ends up holding our previous value
+    return *this;
+}
+
+OptionalExpression& OptionalExpression::operator=(const OptionalExpression& o)
+{
+    if(this != &o) // deep-copy o's Expression, or become empty when o is empty
+        this->expr = o.expr ? std::make_unique<Expression>(*o.expr) : nullptr;
+    return *this;
+}
+
 OptionalExpression::operator bool() const
 {
-    return expr.has_value();
+    return expr != nullptr;
 }
 
 Expression OptionalExpression::operator*() const
 {
-    return std::any_cast<Expression>(expr);
+    return *expr;
 }
 
 OptionalExpression::OptionalExpression(const Expression& expr)
 {
-    this->expr = expr;
+    this->expr = std::make_unique<Expression>(expr);
 }
 OptionalExpression& OptionalExpression::operator=(const Expression& in_expr)
 {
-    this->expr = in_expr;
+    this->expr = std::make_unique<Expression>(in_expr);
     return *this;
 }
 
@@ -346,7 +381,7 @@
         if(separator)
             ret += separator;
         ret += vrender(arg);
-        separator = oper.c_str();
+        separator = oper;
     }
     ret += "}";
     return ret;
@@ -354,20 +389,24 @@
 
 std::string ComplexMultiply::render() const
 {
-    auto a = std::get<Variable>(args[0]);
-    auto b = std::get<Variable>(args[1]);
-    auto r = ComplexLiteral{a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y};
+    auto& a = std::get<Variable>(args[0]);
+    auto& b = std::get<Variable>(args[1]);
+    auto  r = ComplexLiteral{a.x() * b.x() - a.y() * b.y(), a.y() * b.x() + a.x() * b.y()};
     return r.render();
 }
 
 std::string TwiddleMultiply::render() const
 {
-    return ComplexLiteral{a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y}.render();
+    auto& a = vars[0];
+    auto& b = vars[1];
+    return ComplexLiteral{a.x() * b.x() - a.y() * b.y(), a.y() * b.x() + a.x() * b.y()}.render();
 }
 
 std::string TwiddleMultiplyConjugate::render() const
 {
-    return ComplexLiteral{a.x * b.x + a.y * b.y, a.y * b.x - a.x * b.y}.render();
+    auto& a = vars[0];
+    auto& b = vars[1];
+    return ComplexLiteral{a.x() * b.x() + a.y() * b.y(), a.y() * b.x() - a.x() * b.y()}.render();
 }
 
 Parens::Parens(Expression&& inside)
@@ -443,6 +482,11 @@
 {
 }
 
+IntrinsicLoadPlanar::IntrinsicLoadPlanar(const std::vector<Expression>& args)
+    : args(args) // stored as a copy; render() reads entries 0 through 4
+{
+}
+
 std::string IntrinsicLoad::render() const
 {
     // intrinsic_load(const T* data, unsigned int voffset, unsigned int soffset,
@@ -451,6 +495,15 @@
            + "," + vrender(args[3]) + ")";
 }
 
+// Renders "{intrinsic_load(args[0],T),intrinsic_load(args[1],T)}", T = args[2],args[3],args[4].
+std::string IntrinsicLoadPlanar::render() const
+{
+    const std::string shared_tail
+        = "," + vrender(args[2]) + "," + vrender(args[3]) + "," + vrender(args[4]) + ")";
+    return "{intrinsic_load(" + vrender(args[0]) + shared_tail + ",intrinsic_load("
+           + vrender(args[1]) + shared_tail + "}";
+}
+
 std::string Declaration::render() const
 {
     std::string s;
@@ -592,7 +645,7 @@
     f += qualifier + " ";
     if(launch_bounds)
         f += "__launch_bounds__(" + std::to_string(launch_bounds) + ") ";
-    f += "void " + name;
+    f += return_type + " " + name;
     f += "(" + arguments.render_decl() + ") {\n";
     f += body.render();
     f += "}\n";
diff -Nru rocfft-5.5.0/library/src/device/generator/generator.h rocfft-5.7.1/library/src/device/generator/generator.h
--- rocfft-5.5.0/library/src/device/generator/generator.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/generator.h	2023-08-09 16:19:51.000000000 +0000
@@ -21,8 +21,8 @@
 #pragma once
 
 #include <algorithm>
-#include <any>
 #include <iostream>
+#include <memory>
 #include <numeric>
 #include <optional>
 #include <string.h>
@@ -77,7 +77,6 @@
 // Expressions
 //
 
-struct ScalarVariable;
 class Variable;
 class Literal;
 class ComplexLiteral;
@@ -111,6 +110,7 @@
 // FFT expressions
 
 class LoadGlobal;
+class LoadGlobalPlanar;
 
 class TwiddleMultiply;
 class TwiddleMultiplyConjugate;
@@ -120,9 +120,9 @@
 class CallExpr;
 
 class IntrinsicLoad;
+class IntrinsicLoadPlanar;
 
-using Expression = std::variant<ScalarVariable,
-                                Variable,
+using Expression = std::variant<Variable,
                                 Literal,
                                 ComplexLiteral,
                                 Add,
@@ -148,22 +148,24 @@
                                 PreDecrement,
                                 Ternary,
                                 LoadGlobal,
+                                LoadGlobalPlanar,
                                 TwiddleMultiply,
                                 TwiddleMultiplyConjugate,
                                 Parens,
                                 CallExpr,
-                                IntrinsicLoad>;
+                                IntrinsicLoad,
+                                IntrinsicLoadPlanar>;
 
 class OptionalExpression
 {
-    std::any expr;
+    std::unique_ptr<Expression> expr;
 
 public:
-    OptionalExpression(){};
-    OptionalExpression(OptionalExpression&&)      = default;
-    OptionalExpression(const OptionalExpression&) = default;
-    OptionalExpression& operator=(OptionalExpression&&) = default;
-    OptionalExpression& operator=(const OptionalExpression&) = default;
+    OptionalExpression();
+    OptionalExpression(OptionalExpression&&);
+    OptionalExpression(const OptionalExpression&);
+    OptionalExpression& operator=(OptionalExpression&&);
+    OptionalExpression& operator=(const OptionalExpression&);
     explicit OptionalExpression(const Expression& expr);
     OptionalExpression& operator=(const Expression& in_expr);
     Expression          operator*() const;
@@ -206,35 +208,13 @@
     }
 };
 
-struct ScalarVariable
-{
-    static const unsigned int precedence = 0;
-    std::string               name, type;
-    Component                 component;
-    OptionalExpression        index;
-
-    ScalarVariable(const std::string& name,
-                   const std::string& type,
-                   Component          component = Component::BOTH)
-        : name(name)
-        , type(type)
-        , component(component){};
-    ScalarVariable(ScalarVariable&&)      = default;
-    ScalarVariable(const ScalarVariable&) = default;
-    ScalarVariable& operator=(const ScalarVariable&) = default;
-    ScalarVariable& operator=(ScalarVariable&&) = default;
-
-    std::string render() const;
-};
-
 class Variable
 {
 public:
     static const unsigned int precedence = 0;
     std::string               name, type;
     bool                      pointer = false, restrict = false;
-    ScalarVariable            x, y;
-    Component                 component;
+    Component                 component = Component::BOTH;
     // index2d + size2d are set if this is a 2D array variable
     OptionalExpression index;
     OptionalExpression index2D;
@@ -255,12 +235,6 @@
              bool restrict,
              const Expression& _size);
 
-    Variable(const ScalarVariable& v)
-        : name(v.name)
-        , type(v.type)
-        , x(v.name + ".x", v.type)
-        , y(v.name + ".y", v.type){};
-
     Variable(const Variable& v);
     Variable(Variable&& v) = default;
     Variable(const Variable& v, const Expression& _index);
@@ -270,8 +244,12 @@
     Variable& operator=(Variable&&) = default;
     Variable  operator[](const Expression& index) const;
     // do a 2D array access
-    Variable       at(const Expression& index, const Expression& index2D) const;
-    ScalarVariable address() const;
+    Variable at(const Expression& index, const Expression& index2D) const;
+    Variable address() const;
+
+    // assuming this is a complex value, access the x, y members
+    Variable x() const;
+    Variable y() const;
 
     std::string render() const;
 };
@@ -384,22 +362,35 @@
     std::vector<Expression> args;
 };
 
+class LoadGlobalPlanar // expression node; renders as a braced pair of indexed reads
+{
+public:
+    static const unsigned int precedence = 18;
+    explicit LoadGlobalPlanar(const std::vector<Expression>& args);
+    LoadGlobalPlanar(LoadGlobalPlanar&&)      = default;
+    LoadGlobalPlanar(const LoadGlobalPlanar&) = default;
+    LoadGlobalPlanar& operator=(LoadGlobalPlanar&&) = default;
+    LoadGlobalPlanar& operator=(const LoadGlobalPlanar&) = default;
+    // Produces "{args[0][args[2]],args[1][args[2]]}" (defined in generator.cpp).
+    std::string render() const;
+    // render() reads three entries: two arrays (args[0], args[1]) and a shared index (args[2]).
+    std::vector<Expression> args;
+};
+
 class TwiddleMultiply
 {
 public:
     static const unsigned int precedence = 18;
     TwiddleMultiply(const Variable& a, const Variable& b)
-        : a(a)
-        , b(b)
+        : vars({a, b})
     {
     }
     TwiddleMultiply(TwiddleMultiply&&)      = default;
     TwiddleMultiply(const TwiddleMultiply&) = default;
-    TwiddleMultiply& operator=(TwiddleMultiply&&) = default;
-    TwiddleMultiply& operator=(const TwiddleMultiply&) = default;
-    Variable         a;
-    Variable         b;
-    std::string      render() const;
+    TwiddleMultiply&      operator=(TwiddleMultiply&&) = default;
+    TwiddleMultiply&      operator=(const TwiddleMultiply&) = default;
+    std::vector<Variable> vars;
+    std::string           render() const;
 };
 
 class TwiddleMultiplyConjugate
@@ -407,16 +398,14 @@
 public:
     static const unsigned int precedence = 18;
     TwiddleMultiplyConjugate(const Variable& a, const Variable& b)
-        : a(a)
-        , b(b)
+        : vars({a, b})
     {
     }
     TwiddleMultiplyConjugate(TwiddleMultiplyConjugate&&)      = default;
     TwiddleMultiplyConjugate(const TwiddleMultiplyConjugate&) = default;
     TwiddleMultiplyConjugate& operator=(TwiddleMultiplyConjugate&&) = default;
     TwiddleMultiplyConjugate& operator=(const TwiddleMultiplyConjugate&) = default;
-    Variable                  a;
-    Variable                  b;
+    std::vector<Variable>     vars;
     std::string               render() const;
 };
 
@@ -452,10 +441,25 @@
     std::string             render() const;
 };
 
+class IntrinsicLoadPlanar
+{
+public:
+    static const unsigned int precedence = 18;
+    explicit IntrinsicLoadPlanar(const std::vector<Expression>& args);
+    IntrinsicLoadPlanar(IntrinsicLoadPlanar&&)      = default;
+    IntrinsicLoadPlanar(const IntrinsicLoadPlanar&) = default;
+    IntrinsicLoadPlanar& operator=(IntrinsicLoadPlanar&&) = default;
+    IntrinsicLoadPlanar& operator=(const IntrinsicLoadPlanar&) = default;
+    // render() reads five entries and emits two intrinsic_load calls sharing args[2..4]:
+    // data (args[0]), data (args[1]), then presumably voffset, soffset, rw -- TODO confirm
+    std::vector<Expression> args;
+    std::string             render() const;
+};
+
 #define MAKE_OPER(NAME, OPER, PRECEDENCE)                           \
     class NAME                                                      \
     {                                                               \
-        std::string oper{OPER};                                     \
+        static constexpr const char* oper = {OPER};                 \
                                                                     \
     public:                                                         \
         static const unsigned int precedence = PRECEDENCE;          \
@@ -604,8 +608,11 @@
 // Expressions, for some classes.
 
 class Assign;
+class ReturnExpr;
 class Call;
 class CallbackDeclaration;
+class CallbackLoadDeclaration;
+class CallbackStoreDeclaration;
 class Declaration;
 class LDSDeclaration;
 class For;
@@ -618,6 +625,7 @@
 class StatementList;
 class Butterfly;
 class IntrinsicStore;
+class IntrinsicStorePlanar;
 class IntrinsicLoadToDest;
 
 struct LineBreak
@@ -674,8 +682,11 @@
 };
 
 using Statement = std::variant<Assign,
+                               ReturnExpr,
                                Call,
                                CallbackDeclaration,
+                               CallbackLoadDeclaration,
+                               CallbackStoreDeclaration,
                                CommentLines,
                                Declaration,
                                LDSDeclaration,
@@ -692,6 +703,7 @@
                                SyncThreads,
                                Butterfly,
                                IntrinsicStore,
+                               IntrinsicStorePlanar,
                                IntrinsicLoadToDest>;
 
 class Assign
@@ -792,6 +804,49 @@
     }
 };
 
+// Statement node that renders the declaration of `load_cb` via get_load_cb<>.
+class CallbackLoadDeclaration
+{
+public:
+    CallbackLoadDeclaration(const std::string& scalar_type, const std::string& cbtype)
+        : scalar_type(scalar_type)
+        , cbtype(cbtype) {}
+    std::string scalar_type, cbtype; // the two template arguments, in that order
+    std::string render() const
+    {
+        return "auto load_cb = get_load_cb<" + scalar_type + ", " + cbtype + ">(load_cb_fn);";
+    }
+};
+
+// Statement node that renders the declaration of `store_cb` via get_store_cb<>.
+class CallbackStoreDeclaration
+{
+public:
+    CallbackStoreDeclaration(const std::string& scalar_type, const std::string& cbtype)
+        : scalar_type(scalar_type)
+        , cbtype(cbtype) {}
+    std::string scalar_type, cbtype; // the two template arguments, in that order
+    std::string render() const
+    {
+        return "auto store_cb = get_store_cb<" + scalar_type + ", " + cbtype + ">(store_cb_fn);";
+    }
+};
+
+// Statement node that renders as "return <expr>;".
+class ReturnExpr
+{
+public:
+    ReturnExpr(const Expression& expr)
+        : expr(expr)
+    {
+    }
+    Expression expr; // the expression whose rendering follows the `return` keyword
+
+    std::string render() const
+    {
+        return std::string("return ").append(vrender(expr)).append(";");
+    }
+};
 class Call
 {
 public:
@@ -919,12 +974,12 @@
     {
         // Output two assignments
         return Assign{realPtr[index],
-                      scale_factor ? Expression{value.x * scale_factor.value()}
-                                   : Expression{value.x}}
+                      scale_factor ? Expression{value.x() * scale_factor.value()}
+                                   : Expression{value.x()}}
                    .render()
                + Assign{imagPtr[index],
-                        scale_factor ? Expression{value.y * scale_factor.value()}
-                                     : Expression{value.y}}
+                        scale_factor ? Expression{value.y() * scale_factor.value()}
+                                     : Expression{value.y()}}
                      .render();
     }
 
@@ -979,6 +1034,52 @@
     std::optional<Expression> scale_factor;
 };
 
+class IntrinsicStorePlanar // statement node: renders two store_intrinsic calls, one per pointer
+{
+public:
+    IntrinsicStorePlanar(const Expression&                ptrre,
+                         const Expression&                ptrim,
+                         const Expression&                voffset,
+                         const Expression&                soffset,
+                         const Expression&                value,
+                         const Expression&                rw_flag,
+                         const std::optional<Expression>& scale_factor) // empty => value is rendered unscaled
+        : ptrre{ptrre}
+        , ptrim{ptrim}
+        , voffset{voffset}
+        , soffset{soffset}
+        , value{value}
+        , rw_flag{rw_flag}
+        , scale_factor{scale_factor}
+    {
+    }
+    std::string render() const // returns both calls as one string, joined by a newline
+    {
+        return "store_intrinsic(" + vrender(ptrre) + "," + vrender(voffset) + "," + vrender(soffset)
+               + ","
+               + Literal{"real_type_t<scalar_type>("
+                         + vrender(scale_factor ? (value * scale_factor.value()) : value) + ".x)"}
+                     .render() // first call: the ".x" member text goes to ptrre
+               + "," + vrender(rw_flag)
+               + ");"
+                 "\n" // adjacent string literals concatenate, so this separates the two calls
+                 "store_intrinsic("
+               + vrender(ptrim) + "," + vrender(voffset) + "," + vrender(soffset) + ","
+               + Literal{"real_type_t<scalar_type>("
+                         + vrender(scale_factor ? (value * scale_factor.value()) : value) + ".y)"}
+                     .render() // second call: the ".y" member text goes to ptrim
+               + "," + vrender(rw_flag) + ");";
+    }
+
+    Expression                ptrre; // first argument of the ".x" store
+    Expression                ptrim; // first argument of the ".y" store
+    Expression                voffset; // passed unchanged to both store_intrinsic calls
+    Expression                soffset; // passed unchanged to both store_intrinsic calls
+    Expression                value; // expression whose .x / .y members are stored
+    Expression                rw_flag; // last argument of both store_intrinsic calls
+    std::optional<Expression> scale_factor; // when set, the rendered operand is value * scale_factor
+};
+
 class IntrinsicLoadToDest
 {
 public:
@@ -1049,6 +1150,7 @@
     ArgumentList  arguments;
     TemplateList  templates;
     std::string   qualifier;
+    std::string   return_type   = "void";
     unsigned int  launch_bounds = 0;
 
     explicit Function(const std::string& name)
@@ -1078,7 +1180,6 @@
         return visit_##CLS(x);          \
     }
 
-    MAKE_VISITOR_OPERATOR(Expression, ScalarVariable);
     MAKE_VISITOR_OPERATOR(Expression, Variable);
     MAKE_VISITOR_OPERATOR(Expression, Literal);
     MAKE_VISITOR_OPERATOR(Expression, ComplexLiteral);
@@ -1104,16 +1205,21 @@
     MAKE_VISITOR_OPERATOR(Expression, PreDecrement);
     MAKE_VISITOR_OPERATOR(Expression, Ternary);
     MAKE_VISITOR_OPERATOR(Expression, LoadGlobal);
+    MAKE_VISITOR_OPERATOR(Expression, LoadGlobalPlanar);
     MAKE_VISITOR_OPERATOR(Expression, ComplexMultiply);
     MAKE_VISITOR_OPERATOR(Expression, TwiddleMultiply);
     MAKE_VISITOR_OPERATOR(Expression, TwiddleMultiplyConjugate);
     MAKE_VISITOR_OPERATOR(Expression, Parens);
     MAKE_VISITOR_OPERATOR(Expression, CallExpr);
     MAKE_VISITOR_OPERATOR(Expression, IntrinsicLoad);
+    MAKE_VISITOR_OPERATOR(Expression, IntrinsicLoadPlanar);
 
     MAKE_VISITOR_OPERATOR(StatementList, Assign);
+    MAKE_VISITOR_OPERATOR(StatementList, ReturnExpr);
     MAKE_VISITOR_OPERATOR(StatementList, Call);
     MAKE_VISITOR_OPERATOR(StatementList, CallbackDeclaration);
+    MAKE_VISITOR_OPERATOR(StatementList, CallbackLoadDeclaration);
+    MAKE_VISITOR_OPERATOR(StatementList, CallbackStoreDeclaration);
     MAKE_VISITOR_OPERATOR(StatementList, CommentLines);
     MAKE_VISITOR_OPERATOR(StatementList, Declaration);
     MAKE_VISITOR_OPERATOR(StatementList, LDSDeclaration);
@@ -1130,6 +1236,7 @@
     MAKE_VISITOR_OPERATOR(StatementList, SyncThreads);
     MAKE_VISITOR_OPERATOR(StatementList, Butterfly);
     MAKE_VISITOR_OPERATOR(StatementList, IntrinsicStore);
+    MAKE_VISITOR_OPERATOR(StatementList, IntrinsicStorePlanar);
     MAKE_VISITOR_OPERATOR(StatementList, IntrinsicLoadToDest);
 
     MAKE_VISITOR_OPERATOR(ArgumentList, ArgumentList);
@@ -1207,19 +1314,23 @@
     MAKE_EXPR_VISIT(PreDecrement);
 
     MAKE_EXPR_VISIT(LoadGlobal);
+    MAKE_EXPR_VISIT(LoadGlobalPlanar);
 
     MAKE_TRIVIAL_VISIT(Expression, ComplexMultiply);
     MAKE_TRIVIAL_VISIT(Expression, TwiddleMultiply);
     MAKE_TRIVIAL_VISIT(Expression, TwiddleMultiplyConjugate);
 
     MAKE_EXPR_VISIT(IntrinsicLoad);
+    MAKE_EXPR_VISIT(IntrinsicLoadPlanar);
     MAKE_EXPR_VISIT(Parens);
 
     MAKE_EXPR_VISIT(Ternary);
     MAKE_EXPR_VISIT(ComplexLiteral)
 
-    MAKE_TRIVIAL_VISIT(Expression, ScalarVariable)
+    MAKE_TRIVIAL_STATEMENT_VISIT(ReturnExpr)
     MAKE_TRIVIAL_STATEMENT_VISIT(CallbackDeclaration)
+    MAKE_TRIVIAL_STATEMENT_VISIT(CallbackLoadDeclaration)
+    MAKE_TRIVIAL_STATEMENT_VISIT(CallbackStoreDeclaration)
     MAKE_TRIVIAL_STATEMENT_VISIT(LDSDeclaration)
 
     MAKE_TRIVIAL_VISIT(Expression, Literal)
@@ -1342,6 +1453,21 @@
         return StatementList{IntrinsicStore(ptr, voffset, soffset, value, rw_flag)};
     }
 
+    virtual StatementList visit_IntrinsicStorePlanar(const IntrinsicStorePlanar& x) // rebuilds x from visited operands
+    {
+        auto                      ptrre   = std::visit(*this, x.ptrre); // each operand is dispatched back through this visitor
+        auto                      ptrim   = std::visit(*this, x.ptrim);
+        auto                      voffset = std::visit(*this, x.voffset);
+        auto                      soffset = std::visit(*this, x.soffset);
+        auto                      value   = std::visit(*this, x.value);
+        auto                      rw_flag = std::visit(*this, x.rw_flag);
+        std::optional<Expression> scale_factor; // stays empty when x carries no scale factor
+        if(x.scale_factor)
+            scale_factor = std::visit(*this, x.scale_factor.value());
+        return StatementList{ // single-statement list holding the rebuilt node
+            IntrinsicStorePlanar(ptrre, ptrim, voffset, soffset, value, rw_flag, scale_factor)};
+    }
+
     virtual StatementList visit_StoreGlobalPlanar(const StoreGlobalPlanar& x)
     {
         auto                      realPtr = std::get<Variable>(visit_Variable(x.realPtr));
@@ -1454,7 +1580,7 @@
                 auto im = ptr;
                 im.name = imname;
 
-                stmts += Assign{x.lhs, ComplexLiteral{re[idx], im[idx]}, x.oper};
+                stmts += Assign{x.lhs, LoadGlobalPlanar({re, im, idx}), x.oper};
                 return stmts;
             }
         }
@@ -1474,10 +1600,8 @@
                 auto im = ptr;
                 im.name = imname;
 
-                stmts += Assign{x.lhs,
-                                ComplexLiteral{IntrinsicLoad({re, voffset, soffset, rw_flag}),
-                                               IntrinsicLoad({im, voffset, soffset, rw_flag})},
-                                x.oper};
+                stmts += Assign{
+                    x.lhs, IntrinsicLoadPlanar({re, im, voffset, soffset, rw_flag}), x.oper};
                 return stmts;
             }
         }
@@ -1516,26 +1640,9 @@
             auto im = var;
             im.name = imname;
 
-            StatementList stmts;
-            stmts += Call{
-                "store_intrinsic",
-                {re,
-                 x.voffset,
-                 x.soffset,
-                 Literal{"real_type_t<scalar_type>("
-                         + vrender(x.scale_factor ? (x.scale_factor.value() * x.value) : x.value)
-                         + ".x)"},
-                 x.rw_flag}};
-            stmts += Call{
-                "store_intrinsic",
-                {im,
-                 x.voffset,
-                 x.soffset,
-                 Literal{"real_type_t<scalar_type>("
-                         + vrender(x.scale_factor ? (x.scale_factor.value() * x.value) : x.value)
-                         + ".y)"},
-                 x.rw_flag}};
-            return stmts;
+            auto value = std::get<Variable>(x.value);
+            return {IntrinsicStorePlanar{
+                re, im, x.voffset, x.soffset, value, x.rw_flag, x.scale_factor}};
         }
         return StatementList{x};
     }
@@ -1726,7 +1833,7 @@
 {
     Expression visit_TwiddleMultiply(const TwiddleMultiply& x) override
     {
-        return TwiddleMultiplyConjugate{x.a, x.b};
+        return TwiddleMultiplyConjugate{x.vars[0], x.vars[1]}; // swap in the conjugating node, keeping both operands
     }
 
     StatementList visit_Butterfly(const Butterfly& x) override
diff -Nru rocfft-5.5.0/library/src/device/generator/rocfft_butterfly_template.h rocfft-5.7.1/library/src/device/generator/rocfft_butterfly_template.h
--- rocfft-5.5.0/library/src/device/generator/rocfft_butterfly_template.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rocfft_butterfly_template.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  ******************************************************************************/
 
 #ifndef ROCFFT_BUTTERFLY_TEMPLATE_H
@@ -17,10 +17,10 @@
     {
         i += 1;
         j      = u & ((1 << Base) - 1);
-        result = lib_make_vector2<T>((result.x * twiddles[(1 << Base) * i + j].x
-                                      - result.y * twiddles[(1 << Base) * i + j].y),
-                                     (result.y * twiddles[(1 << Base) * i + j].x
-                                      + result.x * twiddles[(1 << Base) * i + j].y));
+        result = T((result.x * twiddles[(1 << Base) * i + j].x
+                    - result.y * twiddles[(1 << Base) * i + j].y),
+                   (result.y * twiddles[(1 << Base) * i + j].x
+                    + result.x * twiddles[(1 << Base) * i + j].y));
     }
     // static compiled
     if(Steps >= 3)
@@ -29,10 +29,10 @@
 
         i += 1;
         j      = u & ((1 << Base) - 1);
-        result = lib_make_vector2<T>((result.x * twiddles[(1 << Base) * i + j].x
-                                      - result.y * twiddles[(1 << Base) * i + j].y),
-                                     (result.y * twiddles[(1 << Base) * i + j].x
-                                      + result.x * twiddles[(1 << Base) * i + j].y));
+        result = T((result.x * twiddles[(1 << Base) * i + j].x
+                    - result.y * twiddles[(1 << Base) * i + j].y),
+                   (result.y * twiddles[(1 << Base) * i + j].x
+                    + result.x * twiddles[(1 << Base) * i + j].y));
     }
     // we probably don't have 4-steps for large-twiddle
     // if(Steps >= 4){...}
@@ -48,13 +48,13 @@
 
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
-                                 (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
+    result = T((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
+               (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
 
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[512 + j].x - result.y * twiddles[512 + j].y),
-                                 (result.y * twiddles[512 + j].x + result.x * twiddles[512 + j].y));
+    result = T((result.x * twiddles[512 + j].x - result.y * twiddles[512 + j].y),
+               (result.y * twiddles[512 + j].x + result.x * twiddles[512 + j].y));
     return result;
 }
 
@@ -132,7 +132,7 @@
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
 
-    (*R3) = (*R1) + lib_make_vector2<T>(-(*R3).y, (*R3).x);
+    (*R3) = (*R1) + T(-(*R3).y, (*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
 
     res   = (*R1);
@@ -153,7 +153,7 @@
 
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
-    (*R3) = (*R1) + lib_make_vector2<T>((*R3).y, -(*R3).x);
+    (*R3) = (*R1) + T((*R3).y, -(*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
 
     res   = (*R1);
@@ -498,21 +498,21 @@
 
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
-    (*R3) = (*R1) + lib_make_vector2<T>(-(*R3).y, (*R3).x);
+    (*R3) = (*R1) + T(-(*R3).y, (*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
     (*R6) = (*R4) - (*R6);
     (*R4) = 2.0 * (*R4) - (*R6);
-    (*R7) = (*R5) + lib_make_vector2<T>(-(*R7).y, (*R7).x);
+    (*R7) = (*R5) + T(-(*R7).y, (*R7).x);
 
     (*R5) = 2.0 * (*R5) - (*R7);
 
     (*R4) = (*R0) - (*R4);
     (*R0) = 2.0 * (*R0) - (*R4);
-    (*R5) = ((*R1) - C8Q * (*R5)) - C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5) = ((*R1) - C8Q * (*R5)) - C8Q * T((*R5).y, -(*R5).x);
     (*R1) = 2.0 * (*R1) - (*R5);
-    (*R6) = (*R2) + lib_make_vector2<T>(-(*R6).y, (*R6).x);
+    (*R6) = (*R2) + T(-(*R6).y, (*R6).x);
     (*R2) = 2.0 * (*R2) - (*R6);
-    (*R7) = ((*R3) + C8Q * (*R7)) - C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7) = ((*R3) + C8Q * (*R7)) - C8Q * T((*R7).y, -(*R7).x);
     (*R3) = 2.0 * (*R3) - (*R7);
 
     res   = (*R1);
@@ -540,20 +540,20 @@
 
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
-    (*R3) = (*R1) + lib_make_vector2<T>((*R3).y, -(*R3).x);
+    (*R3) = (*R1) + T((*R3).y, -(*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
     (*R6) = (*R4) - (*R6);
     (*R4) = 2.0 * (*R4) - (*R6);
-    (*R7) = (*R5) + lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7) = (*R5) + T((*R7).y, -(*R7).x);
     (*R5) = 2.0 * (*R5) - (*R7);
 
     (*R4) = (*R0) - (*R4);
     (*R0) = 2.0 * (*R0) - (*R4);
-    (*R5) = ((*R1) - C8Q * (*R5)) + C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5) = ((*R1) - C8Q * (*R5)) + C8Q * T((*R5).y, -(*R5).x);
     (*R1) = 2.0 * (*R1) - (*R5);
-    (*R6) = (*R2) + lib_make_vector2<T>((*R6).y, -(*R6).x);
+    (*R6) = (*R2) + T((*R6).y, -(*R6).x);
     (*R2) = 2.0 * (*R2) - (*R6);
-    (*R7) = ((*R3) + C8Q * (*R7)) + C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7) = ((*R3) + C8Q * (*R7)) + C8Q * T((*R7).y, -(*R7).x);
     (*R3) = 2.0 * (*R3) - (*R7);
 
     res   = (*R1);
@@ -582,25 +582,24 @@
     // borrow R8 as temp
     (*R8) = (C9QB * p0) + (C9QD * p1) + (p2) + (C9QH * ((*R4) - (*R5)));
     (*R1) = ((*R0) + (C9QA * v0) + (C9QC * v1) - (C9QE * v2) - (C9QG * ((*R4) + (*R5))))
-            + lib_make_vector2<T>((*R8).y, -(*R8).x);
-    (*R8) = (*R1) + 2.0 * lib_make_vector2<T>(-(*R8).y, (*R8).x);
+            + T((*R8).y, -(*R8).x);
+    (*R8) = (*R1) + 2.0 * T(-(*R8).y, (*R8).x);
     // borrow R7 as temp
     (*R7) = -(C9QB * ((*R4) - (*R5))) + (C9QD * p0) - (p2) + (C9QH * p1);
     (*R2) = ((*R0) + (C9QA * ((*R4) + (*R5))) + (C9QC * v0) - (C9QE * v2) - (C9QG * v1))
-            + lib_make_vector2<T>((*R7).y, -(*R7).x);
-    (*R7) = (*R2) + 2.0 * lib_make_vector2<T>(-(*R7).y, (*R7).x);
+            + T((*R7).y, -(*R7).x);
+    (*R7) = (*R2) + 2.0 * T(-(*R7).y, (*R7).x);
     // borrow R6 temp
     (*R6) = C9QF * (p0 + ((*R4) - (*R5)) - p1);
-    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5))))
-            + lib_make_vector2<T>((*R6).y, -(*R6).x);
-    (*R6) = (*R3) + 2.0 * lib_make_vector2<T>(-(*R6).y, (*R6).x);
+    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5)))) + T((*R6).y, -(*R6).x);
+    (*R6) = (*R3) + 2.0 * T(-(*R6).y, (*R6).x);
     // borrow p0 as temp
     p0 = -(C9QB * p1) - (C9QD * ((*R4) - (*R5))) + (p2) + (C9QH * p0);
     p1 = (*R0);
     (*R0) += (v0 + v1 + v2 + (*R4) + (*R5));
     (*R4) = (p1 + (C9QA * v1) + (C9QC * ((*R4) + (*R5))) - (C9QE * v2) - (C9QG * v0))
-            + lib_make_vector2<T>(p0.y, -p0.x);
-    (*R5) = (*R4) + 2.0 * lib_make_vector2<T>(-p0.y, p0.x);
+            + T(p0.y, -p0.x);
+    (*R5) = (*R4) + 2.0 * T(-p0.y, p0.x);
 }
 
 template <typename T>
@@ -619,25 +618,24 @@
     // borrow R8 as temp
     (*R8) = (C9QB * p0) + (C9QD * p1) + (p2) + (C9QH * ((*R4) - (*R5)));
     (*R1) = ((*R0) + (C9QA * v0) + (C9QC * v1) - (C9QE * v2) - (C9QG * ((*R4) + (*R5))))
-            + lib_make_vector2<T>(-(*R8).y, (*R8).x);
-    (*R8) = (*R1) + 2.0 * lib_make_vector2<T>((*R8).y, -(*R8).x);
+            + T(-(*R8).y, (*R8).x);
+    (*R8) = (*R1) + 2.0 * T((*R8).y, -(*R8).x);
     // borrow R7 as temp
     (*R7) = -(C9QB * ((*R4) - (*R5))) + (C9QD * p0) - (p2) + (C9QH * p1);
     (*R2) = ((*R0) + (C9QA * ((*R4) + (*R5))) + (C9QC * v0) - (C9QE * v2) - (C9QG * v1))
-            + lib_make_vector2<T>(-(*R7).y, (*R7).x);
-    (*R7) = (*R2) + 2.0 * lib_make_vector2<T>((*R7).y, -(*R7).x);
+            + T(-(*R7).y, (*R7).x);
+    (*R7) = (*R2) + 2.0 * T((*R7).y, -(*R7).x);
     // borrow R6 temp
     (*R6) = C9QF * (p0 + ((*R4) - (*R5)) - p1);
-    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5))))
-            + lib_make_vector2<T>(-(*R6).y, (*R6).x);
-    (*R6) = (*R3) + 2.0 * lib_make_vector2<T>((*R6).y, -(*R6).x);
+    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5)))) + T(-(*R6).y, (*R6).x);
+    (*R6) = (*R3) + 2.0 * T((*R6).y, -(*R6).x);
     // borrow p0 as temp
     p0 = -(C9QB * p1) - (C9QD * ((*R4) - (*R5))) + (p2) + (C9QH * p0);
     p1 = (*R0);
     (*R0) += (v0 + v1 + v2 + (*R4) + (*R5));
     (*R4) = (p1 + (C9QA * v1) + (C9QC * ((*R4) + (*R5))) - (C9QE * v2) - (C9QG * v0))
-            + lib_make_vector2<T>(-p0.y, p0.x);
-    (*R5) = (*R4) + 2.0 * lib_make_vector2<T>(p0.y, -p0.x);
+            + T(-p0.y, p0.x);
+    (*R5) = (*R4) + 2.0 * T(p0.y, -p0.x);
 }
 
 template <typename T>
@@ -824,57 +822,57 @@
 
     (*R2)  = (*R0) - (*R2);
     (*R0)  = 2.0 * (*R0) - (*R2);
-    (*R3)  = (*R1) + lib_make_vector2<T>(-(*R3).y, (*R3).x);
+    (*R3)  = (*R1) + T(-(*R3).y, (*R3).x);
     (*R1)  = 2.0 * (*R1) - (*R3);
     (*R6)  = (*R4) - (*R6);
     (*R4)  = 2.0 * (*R4) - (*R6);
-    (*R7)  = (*R5) + lib_make_vector2<T>(-(*R7).y, (*R7).x);
+    (*R7)  = (*R5) + T(-(*R7).y, (*R7).x);
     (*R5)  = 2.0 * (*R5) - (*R7);
     (*R10) = (*R8) - (*R10);
     (*R8)  = 2.0 * (*R8) - (*R10);
-    (*R11) = (*R9) + lib_make_vector2<T>(-(*R11).y, (*R11).x);
+    (*R11) = (*R9) + T(-(*R11).y, (*R11).x);
     (*R9)  = 2.0 * (*R9) - (*R11);
     (*R14) = (*R12) - (*R14);
     (*R12) = 2.0 * (*R12) - (*R14);
-    (*R15) = (*R13) + lib_make_vector2<T>(-(*R15).y, (*R15).x);
+    (*R15) = (*R13) + T(-(*R15).y, (*R15).x);
     (*R13) = 2.0 * (*R13) - (*R15);
 
     (*R4)  = (*R0) - (*R4);
     (*R0)  = 2.0 * (*R0) - (*R4);
-    (*R5)  = ((*R1) - C8Q * (*R5)) - C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5)  = ((*R1) - C8Q * (*R5)) - C8Q * T((*R5).y, -(*R5).x);
     (*R1)  = 2.0 * (*R1) - (*R5);
-    (*R6)  = (*R2) + lib_make_vector2<T>(-(*R6).y, (*R6).x);
+    (*R6)  = (*R2) + T(-(*R6).y, (*R6).x);
     (*R2)  = 2.0 * (*R2) - (*R6);
-    (*R7)  = ((*R3) + C8Q * (*R7)) - C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7)  = ((*R3) + C8Q * (*R7)) - C8Q * T((*R7).y, -(*R7).x);
     (*R3)  = 2.0 * (*R3) - (*R7);
     (*R12) = (*R8) - (*R12);
     (*R8)  = 2.0 * (*R8) - (*R12);
-    (*R13) = ((*R9) - C8Q * (*R13)) - C8Q * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R9) - C8Q * (*R13)) - C8Q * T((*R13).y, -(*R13).x);
     (*R9)  = 2.0 * (*R9) - (*R13);
-    (*R14) = (*R10) + lib_make_vector2<T>(-(*R14).y, (*R14).x);
+    (*R14) = (*R10) + T(-(*R14).y, (*R14).x);
     (*R10) = 2.0 * (*R10) - (*R14);
-    (*R15) = ((*R11) + C8Q * (*R15)) - C8Q * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R11) + C8Q * (*R15)) - C8Q * T((*R15).y, -(*R15).x);
     (*R11) = 2.0 * (*R11) - (*R15);
 
     (*R8) = (*R0) - (*R8);
     (*R0) = 2.0 * (*R0) - (*R8);
-    (*R9) = ((*R1) - C16A * (*R9)) - C16B * lib_make_vector2<T>((*R9).y, -(*R9).x);
+    (*R9) = ((*R1) - C16A * (*R9)) - C16B * T((*R9).y, -(*R9).x);
     res   = (*R8);
     (*R1) = 2.0 * (*R1) - (*R9);
 
-    (*R10) = ((*R2) - C8Q * (*R10)) - C8Q * lib_make_vector2<T>((*R10).y, -(*R10).x);
+    (*R10) = ((*R2) - C8Q * (*R10)) - C8Q * T((*R10).y, -(*R10).x);
     (*R2)  = 2.0 * (*R2) - (*R10);
-    (*R11) = ((*R3) - C16B * (*R11)) - C16A * lib_make_vector2<T>((*R11).y, -(*R11).x);
+    (*R11) = ((*R3) - C16B * (*R11)) - C16A * T((*R11).y, -(*R11).x);
     (*R3)  = 2.0 * (*R3) - (*R11);
 
-    (*R12) = (*R4) + lib_make_vector2<T>(-(*R12).y, (*R12).x);
+    (*R12) = (*R4) + T(-(*R12).y, (*R12).x);
     (*R4)  = 2.0 * (*R4) - (*R12);
-    (*R13) = ((*R5) + C16B * (*R13)) - C16A * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R5) + C16B * (*R13)) - C16A * T((*R13).y, -(*R13).x);
     (*R5)  = 2.0 * (*R5) - (*R13);
 
-    (*R14) = ((*R6) + C8Q * (*R14)) - C8Q * lib_make_vector2<T>((*R14).y, -(*R14).x);
+    (*R14) = ((*R6) + C8Q * (*R14)) - C8Q * T((*R14).y, -(*R14).x);
     (*R6)  = 2.0 * (*R6) - (*R14);
-    (*R15) = ((*R7) + C16A * (*R15)) - C16B * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R7) + C16A * (*R15)) - C16B * T((*R15).y, -(*R15).x);
     (*R7)  = 2.0 * (*R7) - (*R15);
 
     res    = (*R1);
@@ -937,53 +935,53 @@
 
     (*R2)  = (*R0) - (*R2);
     (*R0)  = 2.0 * (*R0) - (*R2);
-    (*R3)  = (*R1) + lib_make_vector2<T>((*R3).y, -(*R3).x);
+    (*R3)  = (*R1) + T((*R3).y, -(*R3).x);
     (*R1)  = 2.0 * (*R1) - (*R3);
     (*R6)  = (*R4) - (*R6);
     (*R4)  = 2.0 * (*R4) - (*R6);
-    (*R7)  = (*R5) + lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7)  = (*R5) + T((*R7).y, -(*R7).x);
     (*R5)  = 2.0 * (*R5) - (*R7);
     (*R10) = (*R8) - (*R10);
     (*R8)  = 2.0 * (*R8) - (*R10);
-    (*R11) = (*R9) + lib_make_vector2<T>((*R11).y, -(*R11).x);
+    (*R11) = (*R9) + T((*R11).y, -(*R11).x);
     (*R9)  = 2.0 * (*R9) - (*R11);
     (*R14) = (*R12) - (*R14);
     (*R12) = 2.0 * (*R12) - (*R14);
-    (*R15) = (*R13) + lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = (*R13) + T((*R15).y, -(*R15).x);
     (*R13) = 2.0 * (*R13) - (*R15);
 
     (*R4)  = (*R0) - (*R4);
     (*R0)  = 2.0 * (*R0) - (*R4);
-    (*R5)  = ((*R1) - C8Q * (*R5)) + C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5)  = ((*R1) - C8Q * (*R5)) + C8Q * T((*R5).y, -(*R5).x);
     (*R1)  = 2.0 * (*R1) - (*R5);
-    (*R6)  = (*R2) + lib_make_vector2<T>((*R6).y, -(*R6).x);
+    (*R6)  = (*R2) + T((*R6).y, -(*R6).x);
     (*R2)  = 2.0 * (*R2) - (*R6);
-    (*R7)  = ((*R3) + C8Q * (*R7)) + C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7)  = ((*R3) + C8Q * (*R7)) + C8Q * T((*R7).y, -(*R7).x);
     (*R3)  = 2.0 * (*R3) - (*R7);
     (*R12) = (*R8) - (*R12);
     (*R8)  = 2.0 * (*R8) - (*R12);
-    (*R13) = ((*R9) - C8Q * (*R13)) + C8Q * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R9) - C8Q * (*R13)) + C8Q * T((*R13).y, -(*R13).x);
     (*R9)  = 2.0 * (*R9) - (*R13);
-    (*R14) = (*R10) + lib_make_vector2<T>((*R14).y, -(*R14).x);
+    (*R14) = (*R10) + T((*R14).y, -(*R14).x);
     (*R10) = 2.0 * (*R10) - (*R14);
-    (*R15) = ((*R11) + C8Q * (*R15)) + C8Q * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R11) + C8Q * (*R15)) + C8Q * T((*R15).y, -(*R15).x);
     (*R11) = 2.0 * (*R11) - (*R15);
 
     (*R8)  = (*R0) - (*R8);
     (*R0)  = 2.0 * (*R0) - (*R8);
-    (*R9)  = ((*R1) - C16A * (*R9)) + C16B * lib_make_vector2<T>((*R9).y, -(*R9).x);
+    (*R9)  = ((*R1) - C16A * (*R9)) + C16B * T((*R9).y, -(*R9).x);
     (*R1)  = 2.0 * (*R1) - (*R9);
-    (*R10) = ((*R2) - C8Q * (*R10)) + C8Q * lib_make_vector2<T>((*R10).y, -(*R10).x);
+    (*R10) = ((*R2) - C8Q * (*R10)) + C8Q * T((*R10).y, -(*R10).x);
     (*R2)  = 2.0 * (*R2) - (*R10);
-    (*R11) = ((*R3) - C16B * (*R11)) + C16A * lib_make_vector2<T>((*R11).y, -(*R11).x);
+    (*R11) = ((*R3) - C16B * (*R11)) + C16A * T((*R11).y, -(*R11).x);
     (*R3)  = 2.0 * (*R3) - (*R11);
-    (*R12) = (*R4) + lib_make_vector2<T>((*R12).y, -(*R12).x);
+    (*R12) = (*R4) + T((*R12).y, -(*R12).x);
     (*R4)  = 2.0 * (*R4) - (*R12);
-    (*R13) = ((*R5) + C16B * (*R13)) + C16A * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R5) + C16B * (*R13)) + C16A * T((*R13).y, -(*R13).x);
     (*R5)  = 2.0 * (*R5) - (*R13);
-    (*R14) = ((*R6) + C8Q * (*R14)) + C8Q * lib_make_vector2<T>((*R14).y, -(*R14).x);
+    (*R14) = ((*R6) + C8Q * (*R14)) + C8Q * T((*R14).y, -(*R14).x);
     (*R6)  = 2.0 * (*R6) - (*R14);
-    (*R15) = ((*R7) + C16A * (*R15)) + C16B * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R7) + C16A * (*R15)) + C16B * T((*R15).y, -(*R15).x);
     (*R7)  = 2.0 * (*R7) - (*R15);
 
     res    = (*R1);
diff -Nru rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/large_twiddles.h rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/large_twiddles.h
--- rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/large_twiddles.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/large_twiddles.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  ******************************************************************************/
 
 template <typename T, size_t Base, size_t Steps>
@@ -14,10 +14,10 @@
     {
         i += 1;
         j      = u & ((1 << Base) - 1);
-        result = lib_make_vector2<T>((result.x * twiddles[(1 << Base) * i + j].x
-                                      - result.y * twiddles[(1 << Base) * i + j].y),
-                                     (result.y * twiddles[(1 << Base) * i + j].x
-                                      + result.x * twiddles[(1 << Base) * i + j].y));
+        result = T((result.x * twiddles[(1 << Base) * i + j].x
+                    - result.y * twiddles[(1 << Base) * i + j].y),
+                   (result.y * twiddles[(1 << Base) * i + j].x
+                    + result.x * twiddles[(1 << Base) * i + j].y));
     }
     // static compiled
     if(Steps >= 3)
@@ -26,10 +26,10 @@
 
         i += 1;
         j      = u & ((1 << Base) - 1);
-        result = lib_make_vector2<T>((result.x * twiddles[(1 << Base) * i + j].x
-                                      - result.y * twiddles[(1 << Base) * i + j].y),
-                                     (result.y * twiddles[(1 << Base) * i + j].x
-                                      + result.x * twiddles[(1 << Base) * i + j].y));
+        result = T((result.x * twiddles[(1 << Base) * i + j].x
+                    - result.y * twiddles[(1 << Base) * i + j].y),
+                   (result.y * twiddles[(1 << Base) * i + j].x
+                    + result.x * twiddles[(1 << Base) * i + j].y));
     }
     static_assert(Steps < 4, "4-steps is not support");
     // if(Steps >= 4){...}
diff -Nru rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_16.h rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_16.h
--- rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_16.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_16.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  ******************************************************************************/
 
 template <typename T>
@@ -42,57 +42,57 @@
 
     (*R2)  = (*R0) - (*R2);
     (*R0)  = 2.0 * (*R0) - (*R2);
-    (*R3)  = (*R1) + lib_make_vector2<T>(-(*R3).y, (*R3).x);
+    (*R3)  = (*R1) + T(-(*R3).y, (*R3).x);
     (*R1)  = 2.0 * (*R1) - (*R3);
     (*R6)  = (*R4) - (*R6);
     (*R4)  = 2.0 * (*R4) - (*R6);
-    (*R7)  = (*R5) + lib_make_vector2<T>(-(*R7).y, (*R7).x);
+    (*R7)  = (*R5) + T(-(*R7).y, (*R7).x);
     (*R5)  = 2.0 * (*R5) - (*R7);
     (*R10) = (*R8) - (*R10);
     (*R8)  = 2.0 * (*R8) - (*R10);
-    (*R11) = (*R9) + lib_make_vector2<T>(-(*R11).y, (*R11).x);
+    (*R11) = (*R9) + T(-(*R11).y, (*R11).x);
     (*R9)  = 2.0 * (*R9) - (*R11);
     (*R14) = (*R12) - (*R14);
     (*R12) = 2.0 * (*R12) - (*R14);
-    (*R15) = (*R13) + lib_make_vector2<T>(-(*R15).y, (*R15).x);
+    (*R15) = (*R13) + T(-(*R15).y, (*R15).x);
     (*R13) = 2.0 * (*R13) - (*R15);
 
     (*R4)  = (*R0) - (*R4);
     (*R0)  = 2.0 * (*R0) - (*R4);
-    (*R5)  = ((*R1) - C8Q * (*R5)) - C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5)  = ((*R1) - C8Q * (*R5)) - C8Q * T((*R5).y, -(*R5).x);
     (*R1)  = 2.0 * (*R1) - (*R5);
-    (*R6)  = (*R2) + lib_make_vector2<T>(-(*R6).y, (*R6).x);
+    (*R6)  = (*R2) + T(-(*R6).y, (*R6).x);
     (*R2)  = 2.0 * (*R2) - (*R6);
-    (*R7)  = ((*R3) + C8Q * (*R7)) - C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7)  = ((*R3) + C8Q * (*R7)) - C8Q * T((*R7).y, -(*R7).x);
     (*R3)  = 2.0 * (*R3) - (*R7);
     (*R12) = (*R8) - (*R12);
     (*R8)  = 2.0 * (*R8) - (*R12);
-    (*R13) = ((*R9) - C8Q * (*R13)) - C8Q * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R9) - C8Q * (*R13)) - C8Q * T((*R13).y, -(*R13).x);
     (*R9)  = 2.0 * (*R9) - (*R13);
-    (*R14) = (*R10) + lib_make_vector2<T>(-(*R14).y, (*R14).x);
+    (*R14) = (*R10) + T(-(*R14).y, (*R14).x);
     (*R10) = 2.0 * (*R10) - (*R14);
-    (*R15) = ((*R11) + C8Q * (*R15)) - C8Q * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R11) + C8Q * (*R15)) - C8Q * T((*R15).y, -(*R15).x);
     (*R11) = 2.0 * (*R11) - (*R15);
 
     (*R8) = (*R0) - (*R8);
     (*R0) = 2.0 * (*R0) - (*R8);
-    (*R9) = ((*R1) - C16A * (*R9)) - C16B * lib_make_vector2<T>((*R9).y, -(*R9).x);
+    (*R9) = ((*R1) - C16A * (*R9)) - C16B * T((*R9).y, -(*R9).x);
     res   = (*R8);
     (*R1) = 2.0 * (*R1) - (*R9);
 
-    (*R10) = ((*R2) - C8Q * (*R10)) - C8Q * lib_make_vector2<T>((*R10).y, -(*R10).x);
+    (*R10) = ((*R2) - C8Q * (*R10)) - C8Q * T((*R10).y, -(*R10).x);
     (*R2)  = 2.0 * (*R2) - (*R10);
-    (*R11) = ((*R3) - C16B * (*R11)) - C16A * lib_make_vector2<T>((*R11).y, -(*R11).x);
+    (*R11) = ((*R3) - C16B * (*R11)) - C16A * T((*R11).y, -(*R11).x);
     (*R3)  = 2.0 * (*R3) - (*R11);
 
-    (*R12) = (*R4) + lib_make_vector2<T>(-(*R12).y, (*R12).x);
+    (*R12) = (*R4) + T(-(*R12).y, (*R12).x);
     (*R4)  = 2.0 * (*R4) - (*R12);
-    (*R13) = ((*R5) + C16B * (*R13)) - C16A * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R5) + C16B * (*R13)) - C16A * T((*R13).y, -(*R13).x);
     (*R5)  = 2.0 * (*R5) - (*R13);
 
-    (*R14) = ((*R6) + C8Q * (*R14)) - C8Q * lib_make_vector2<T>((*R14).y, -(*R14).x);
+    (*R14) = ((*R6) + C8Q * (*R14)) - C8Q * T((*R14).y, -(*R14).x);
     (*R6)  = 2.0 * (*R6) - (*R14);
-    (*R15) = ((*R7) + C16A * (*R15)) - C16B * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R7) + C16A * (*R15)) - C16B * T((*R15).y, -(*R15).x);
     (*R7)  = 2.0 * (*R7) - (*R15);
 
     res    = (*R1);
@@ -155,53 +155,53 @@
 
     (*R2)  = (*R0) - (*R2);
     (*R0)  = 2.0 * (*R0) - (*R2);
-    (*R3)  = (*R1) + lib_make_vector2<T>((*R3).y, -(*R3).x);
+    (*R3)  = (*R1) + T((*R3).y, -(*R3).x);
     (*R1)  = 2.0 * (*R1) - (*R3);
     (*R6)  = (*R4) - (*R6);
     (*R4)  = 2.0 * (*R4) - (*R6);
-    (*R7)  = (*R5) + lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7)  = (*R5) + T((*R7).y, -(*R7).x);
     (*R5)  = 2.0 * (*R5) - (*R7);
     (*R10) = (*R8) - (*R10);
     (*R8)  = 2.0 * (*R8) - (*R10);
-    (*R11) = (*R9) + lib_make_vector2<T>((*R11).y, -(*R11).x);
+    (*R11) = (*R9) + T((*R11).y, -(*R11).x);
     (*R9)  = 2.0 * (*R9) - (*R11);
     (*R14) = (*R12) - (*R14);
     (*R12) = 2.0 * (*R12) - (*R14);
-    (*R15) = (*R13) + lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = (*R13) + T((*R15).y, -(*R15).x);
     (*R13) = 2.0 * (*R13) - (*R15);
 
     (*R4)  = (*R0) - (*R4);
     (*R0)  = 2.0 * (*R0) - (*R4);
-    (*R5)  = ((*R1) - C8Q * (*R5)) + C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5)  = ((*R1) - C8Q * (*R5)) + C8Q * T((*R5).y, -(*R5).x);
     (*R1)  = 2.0 * (*R1) - (*R5);
-    (*R6)  = (*R2) + lib_make_vector2<T>((*R6).y, -(*R6).x);
+    (*R6)  = (*R2) + T((*R6).y, -(*R6).x);
     (*R2)  = 2.0 * (*R2) - (*R6);
-    (*R7)  = ((*R3) + C8Q * (*R7)) + C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7)  = ((*R3) + C8Q * (*R7)) + C8Q * T((*R7).y, -(*R7).x);
     (*R3)  = 2.0 * (*R3) - (*R7);
     (*R12) = (*R8) - (*R12);
     (*R8)  = 2.0 * (*R8) - (*R12);
-    (*R13) = ((*R9) - C8Q * (*R13)) + C8Q * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R9) - C8Q * (*R13)) + C8Q * T((*R13).y, -(*R13).x);
     (*R9)  = 2.0 * (*R9) - (*R13);
-    (*R14) = (*R10) + lib_make_vector2<T>((*R14).y, -(*R14).x);
+    (*R14) = (*R10) + T((*R14).y, -(*R14).x);
     (*R10) = 2.0 * (*R10) - (*R14);
-    (*R15) = ((*R11) + C8Q * (*R15)) + C8Q * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R11) + C8Q * (*R15)) + C8Q * T((*R15).y, -(*R15).x);
     (*R11) = 2.0 * (*R11) - (*R15);
 
     (*R8)  = (*R0) - (*R8);
     (*R0)  = 2.0 * (*R0) - (*R8);
-    (*R9)  = ((*R1) - C16A * (*R9)) + C16B * lib_make_vector2<T>((*R9).y, -(*R9).x);
+    (*R9)  = ((*R1) - C16A * (*R9)) + C16B * T((*R9).y, -(*R9).x);
     (*R1)  = 2.0 * (*R1) - (*R9);
-    (*R10) = ((*R2) - C8Q * (*R10)) + C8Q * lib_make_vector2<T>((*R10).y, -(*R10).x);
+    (*R10) = ((*R2) - C8Q * (*R10)) + C8Q * T((*R10).y, -(*R10).x);
     (*R2)  = 2.0 * (*R2) - (*R10);
-    (*R11) = ((*R3) - C16B * (*R11)) + C16A * lib_make_vector2<T>((*R11).y, -(*R11).x);
+    (*R11) = ((*R3) - C16B * (*R11)) + C16A * T((*R11).y, -(*R11).x);
     (*R3)  = 2.0 * (*R3) - (*R11);
-    (*R12) = (*R4) + lib_make_vector2<T>((*R12).y, -(*R12).x);
+    (*R12) = (*R4) + T((*R12).y, -(*R12).x);
     (*R4)  = 2.0 * (*R4) - (*R12);
-    (*R13) = ((*R5) + C16B * (*R13)) + C16A * lib_make_vector2<T>((*R13).y, -(*R13).x);
+    (*R13) = ((*R5) + C16B * (*R13)) + C16A * T((*R13).y, -(*R13).x);
     (*R5)  = 2.0 * (*R5) - (*R13);
-    (*R14) = ((*R6) + C8Q * (*R14)) + C8Q * lib_make_vector2<T>((*R14).y, -(*R14).x);
+    (*R14) = ((*R6) + C8Q * (*R14)) + C8Q * T((*R14).y, -(*R14).x);
     (*R6)  = 2.0 * (*R6) - (*R14);
-    (*R15) = ((*R7) + C16A * (*R15)) + C16B * lib_make_vector2<T>((*R15).y, -(*R15).x);
+    (*R15) = ((*R7) + C16A * (*R15)) + C16B * T((*R15).y, -(*R15).x);
     (*R7)  = 2.0 * (*R7) - (*R15);
 
     res    = (*R1);
diff -Nru rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_4.h rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_4.h
--- rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_4.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_4.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  ******************************************************************************/
 
 template <typename T>
@@ -16,7 +16,7 @@
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
 
-    (*R3) = (*R1) + lib_make_vector2<T>(-(*R3).y, (*R3).x);
+    (*R3) = (*R1) + T(-(*R3).y, (*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
 
     res   = (*R1);
@@ -37,7 +37,7 @@
 
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
-    (*R3) = (*R1) + lib_make_vector2<T>((*R3).y, -(*R3).x);
+    (*R3) = (*R1) + T((*R3).y, -(*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
 
     res   = (*R1);
diff -Nru rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_8.h rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_8.h
--- rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_8.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_8.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  ******************************************************************************/
 
 template <typename T>
@@ -19,21 +19,21 @@
 
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
-    (*R3) = (*R1) + lib_make_vector2<T>(-(*R3).y, (*R3).x);
+    (*R3) = (*R1) + T(-(*R3).y, (*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
     (*R6) = (*R4) - (*R6);
     (*R4) = 2.0 * (*R4) - (*R6);
-    (*R7) = (*R5) + lib_make_vector2<T>(-(*R7).y, (*R7).x);
+    (*R7) = (*R5) + T(-(*R7).y, (*R7).x);
 
     (*R5) = 2.0 * (*R5) - (*R7);
 
     (*R4) = (*R0) - (*R4);
     (*R0) = 2.0 * (*R0) - (*R4);
-    (*R5) = ((*R1) - C8Q * (*R5)) - C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5) = ((*R1) - C8Q * (*R5)) - C8Q * T((*R5).y, -(*R5).x);
     (*R1) = 2.0 * (*R1) - (*R5);
-    (*R6) = (*R2) + lib_make_vector2<T>(-(*R6).y, (*R6).x);
+    (*R6) = (*R2) + T(-(*R6).y, (*R6).x);
     (*R2) = 2.0 * (*R2) - (*R6);
-    (*R7) = ((*R3) + C8Q * (*R7)) - C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7) = ((*R3) + C8Q * (*R7)) - C8Q * T((*R7).y, -(*R7).x);
     (*R3) = 2.0 * (*R3) - (*R7);
 
     res   = (*R1);
@@ -61,20 +61,20 @@
 
     (*R2) = (*R0) - (*R2);
     (*R0) = 2.0 * (*R0) - (*R2);
-    (*R3) = (*R1) + lib_make_vector2<T>((*R3).y, -(*R3).x);
+    (*R3) = (*R1) + T((*R3).y, -(*R3).x);
     (*R1) = 2.0 * (*R1) - (*R3);
     (*R6) = (*R4) - (*R6);
     (*R4) = 2.0 * (*R4) - (*R6);
-    (*R7) = (*R5) + lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7) = (*R5) + T((*R7).y, -(*R7).x);
     (*R5) = 2.0 * (*R5) - (*R7);
 
     (*R4) = (*R0) - (*R4);
     (*R0) = 2.0 * (*R0) - (*R4);
-    (*R5) = ((*R1) - C8Q * (*R5)) + C8Q * lib_make_vector2<T>((*R5).y, -(*R5).x);
+    (*R5) = ((*R1) - C8Q * (*R5)) + C8Q * T((*R5).y, -(*R5).x);
     (*R1) = 2.0 * (*R1) - (*R5);
-    (*R6) = (*R2) + lib_make_vector2<T>((*R6).y, -(*R6).x);
+    (*R6) = (*R2) + T((*R6).y, -(*R6).x);
     (*R2) = 2.0 * (*R2) - (*R6);
-    (*R7) = ((*R3) + C8Q * (*R7)) + C8Q * lib_make_vector2<T>((*R7).y, -(*R7).x);
+    (*R7) = ((*R3) + C8Q * (*R7)) + C8Q * T((*R7).y, -(*R7).x);
     (*R3) = 2.0 * (*R3) - (*R7);
 
     res   = (*R1);
diff -Nru rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_9.h rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_9.h
--- rocfft-5.5.0/library/src/device/generator/rtc_radix_functions/radix_9.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rtc_radix_functions/radix_9.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
  ******************************************************************************/
 
 template <typename T>
@@ -20,25 +20,24 @@
     // borrow R8 as temp
     (*R8) = (C9QB * p0) + (C9QD * p1) + (p2) + (C9QH * ((*R4) - (*R5)));
     (*R1) = ((*R0) + (C9QA * v0) + (C9QC * v1) - (C9QE * v2) - (C9QG * ((*R4) + (*R5))))
-            + lib_make_vector2<T>((*R8).y, -(*R8).x);
-    (*R8) = (*R1) + 2.0 * lib_make_vector2<T>(-(*R8).y, (*R8).x);
+            + T((*R8).y, -(*R8).x);
+    (*R8) = (*R1) + 2.0 * T(-(*R8).y, (*R8).x);
     // borrow R7 as temp
     (*R7) = -(C9QB * ((*R4) - (*R5))) + (C9QD * p0) - (p2) + (C9QH * p1);
     (*R2) = ((*R0) + (C9QA * ((*R4) + (*R5))) + (C9QC * v0) - (C9QE * v2) - (C9QG * v1))
-            + lib_make_vector2<T>((*R7).y, -(*R7).x);
-    (*R7) = (*R2) + 2.0 * lib_make_vector2<T>(-(*R7).y, (*R7).x);
+            + T((*R7).y, -(*R7).x);
+    (*R7) = (*R2) + 2.0 * T(-(*R7).y, (*R7).x);
     // borrow R6 temp
     (*R6) = C9QF * (p0 + ((*R4) - (*R5)) - p1);
-    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5))))
-            + lib_make_vector2<T>((*R6).y, -(*R6).x);
-    (*R6) = (*R3) + 2.0 * lib_make_vector2<T>(-(*R6).y, (*R6).x);
+    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5)))) + T((*R6).y, -(*R6).x);
+    (*R6) = (*R3) + 2.0 * T(-(*R6).y, (*R6).x);
     // borrow p0 as temp
     p0 = -(C9QB * p1) - (C9QD * ((*R4) - (*R5))) + (p2) + (C9QH * p0);
     p1 = (*R0);
     (*R0) += (v0 + v1 + v2 + (*R4) + (*R5));
     (*R4) = (p1 + (C9QA * v1) + (C9QC * ((*R4) + (*R5))) - (C9QE * v2) - (C9QG * v0))
-            + lib_make_vector2<T>(p0.y, -p0.x);
-    (*R5) = (*R4) + 2.0 * lib_make_vector2<T>(-p0.y, p0.x);
+            + T(p0.y, -p0.x);
+    (*R5) = (*R4) + 2.0 * T(-p0.y, p0.x);
 }
 
 template <typename T>
@@ -57,23 +56,22 @@
     // borrow R8 as temp
     (*R8) = (C9QB * p0) + (C9QD * p1) + (p2) + (C9QH * ((*R4) - (*R5)));
     (*R1) = ((*R0) + (C9QA * v0) + (C9QC * v1) - (C9QE * v2) - (C9QG * ((*R4) + (*R5))))
-            + lib_make_vector2<T>(-(*R8).y, (*R8).x);
-    (*R8) = (*R1) + 2.0 * lib_make_vector2<T>((*R8).y, -(*R8).x);
+            + T(-(*R8).y, (*R8).x);
+    (*R8) = (*R1) + 2.0 * T((*R8).y, -(*R8).x);
     // borrow R7 as temp
     (*R7) = -(C9QB * ((*R4) - (*R5))) + (C9QD * p0) - (p2) + (C9QH * p1);
     (*R2) = ((*R0) + (C9QA * ((*R4) + (*R5))) + (C9QC * v0) - (C9QE * v2) - (C9QG * v1))
-            + lib_make_vector2<T>(-(*R7).y, (*R7).x);
-    (*R7) = (*R2) + 2.0 * lib_make_vector2<T>((*R7).y, -(*R7).x);
+            + T(-(*R7).y, (*R7).x);
+    (*R7) = (*R2) + 2.0 * T((*R7).y, -(*R7).x);
     // borrow R6 temp
     (*R6) = C9QF * (p0 + ((*R4) - (*R5)) - p1);
-    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5))))
-            + lib_make_vector2<T>(-(*R6).y, (*R6).x);
-    (*R6) = (*R3) + 2.0 * lib_make_vector2<T>((*R6).y, -(*R6).x);
+    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5)))) + T(-(*R6).y, (*R6).x);
+    (*R6) = (*R3) + 2.0 * T((*R6).y, -(*R6).x);
     // borrow p0 as temp
     p0 = -(C9QB * p1) - (C9QD * ((*R4) - (*R5))) + (p2) + (C9QH * p0);
     p1 = (*R0);
     (*R0) += (v0 + v1 + v2 + (*R4) + (*R5));
     (*R4) = (p1 + (C9QA * v1) + (C9QC * ((*R4) + (*R5))) - (C9QE * v2) - (C9QG * v0))
-            + lib_make_vector2<T>(-p0.y, p0.x);
-    (*R5) = (*R4) + 2.0 * lib_make_vector2<T>(p0.y, -p0.x);
+            + T(-p0.y, p0.x);
+    (*R5) = (*R4) + 2.0 * T(p0.y, -p0.x);
 }
diff -Nru rocfft-5.5.0/library/src/device/generator/rtc_workarounds.h rocfft-5.7.1/library/src/device/generator/rtc_workarounds.h
--- rocfft-5.5.0/library/src/device/generator/rtc_workarounds.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/rtc_workarounds.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,46 +0,0 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-// complex number operators that are not present in hipRTC
-
-#ifndef ROCFFT_RTC_WORKAROUND_H
-#define ROCFFT_RTC_WORKAROUND_H
-
-__device__ float2& operator*=(float2& f2, const float f)
-{
-    return f2 *= float2{f};
-}
-
-__device__ double2& operator*=(double2& f2, const double f)
-{
-    return f2 *= double2{f};
-}
-
-__device__ float2 operator-(float2 f2)
-{
-    return float2{-f2.x, -f2.y};
-}
-
-__device__ double2 operator-(double2 f2)
-{
-    return double2{-f2.x, -f2.y};
-}
-
-#endif // ROCFFT_RTC_WORKAROUND_H
diff -Nru rocfft-5.5.0/library/src/device/generator/stockham_gen.cpp rocfft-5.7.1/library/src/device/generator/stockham_gen.cpp
--- rocfft-5.5.0/library/src/device/generator/stockham_gen.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/stockham_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -305,7 +305,7 @@
     }
     else if(specs.scheme == "CS_KERNEL_STOCKHAM_BLOCK_CC")
     {
-        StockhamKernelCC kernel(specs, false);
+        StockhamKernelCC kernel(specs, false, false);
         output += append_common_functions(kernel.generate_lds_to_reg_input_function(),
                                           kernel.generate_lds_from_reg_output_function(),
                                           {},
@@ -323,7 +323,7 @@
     }
     else if(specs.scheme == "CS_KERNEL_STOCKHAM_BLOCK_RC")
     {
-        StockhamKernelRC kernel(specs);
+        StockhamKernelRC kernel(specs, false);
         output += append_common_functions(kernel.generate_lds_to_reg_input_function(),
                                           kernel.generate_lds_from_reg_output_function(),
                                           {},
@@ -332,7 +332,7 @@
             kernel.generate_device_function(), {}, kernel.generate_global_function(), false);
 
         std::vector<LaunchSuffix> suffixes;
-        suffixes.push_back({"sbrc", "CS_KERNEL_STOCKHAM_BLOCK_RC", "SBRC_2D", "NONE"});
+        suffixes.push_back({"sbrc", "CS_KERNEL_STOCKHAM_BLOCK_RC", "SBRC_2D", "TILE_ALIGNED"});
         suffixes.push_back(
             {"sbrc_unaligned", "CS_KERNEL_STOCKHAM_BLOCK_RC", "SBRC_2D", "TILE_UNALIGNED"});
         suffixes.push_back({"sbrc3d_fft_trans_xy_z_tile_aligned",
diff -Nru rocfft-5.5.0/library/src/device/generator/stockham_gen.h rocfft-5.7.1/library/src/device/generator/stockham_gen.h
--- rocfft-5.5.0/library/src/device/generator/stockham_gen.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/stockham_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -58,6 +58,13 @@
     // statically defined for the kernel
     unsigned int static_dim = 0;
     std::string  scheme;
+
+    // This flag indicates that the wgs and tpt values are exactly what we want
+    // (i.e. they were already derived somewhere else),
+    // and tells StockhamKernel not to do its auto-derivation again.
+    // Particularly useful when tuning or running a tuned kernel (RTC-ing):
+    // we don't want them to be overwritten by StockhamKernel.
+    bool wgs_is_derived = false;
 };
 
 // generate default stockham variants for ahead-of-time compilation
diff -Nru rocfft-5.5.0/library/src/device/generator/stockham_gen_base.h rocfft-5.7.1/library/src/device/generator/stockham_gen_base.h
--- rocfft-5.5.0/library/src/device/generator/stockham_gen_base.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/stockham_gen_base.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -41,37 +41,46 @@
     StockhamKernel(const StockhamGeneratorSpecs& specs)
         : StockhamGeneratorSpecs(specs)
     {
-        auto bytes_per_batch = length * BYTES_PER_ELEMENT;
+        // RTC-ing kernels for tuning always goes this way
+        if(wgs_is_derived)
+        {
+            transforms_per_block = workgroup_size / threads_per_transform;
+        }
+        else
+        {
+            auto bytes_per_batch = length * BYTES_PER_ELEMENT;
 
-        if(half_lds)
-            bytes_per_batch /= 2;
+            if(half_lds)
+                bytes_per_batch /= 2;
 
-        if(threads_per_transform == 0)
-        {
-            threads_per_transform = 1;
-            for(unsigned int t = 2; t < length; ++t)
+            if(threads_per_transform == 0)
             {
-                if(t > workgroup_size)
-                    continue;
-                if(length % t == 0)
+                threads_per_transform = 1;
+                for(unsigned int t = 2; t < length; ++t)
                 {
-                    if(std::all_of(factors.begin(), factors.end(), [=](unsigned int f) {
-                           return (length / t) % f == 0;
-                       }))
-                        threads_per_transform = t;
+                    if(t > workgroup_size)
+                        continue;
+                    if(length % t == 0)
+                    {
+                        if(std::all_of(factors.begin(), factors.end(), [=](unsigned int f) {
+                               return (length / t) % f == 0;
+                           }))
+                            threads_per_transform = t;
+                    }
                 }
             }
+
+            transforms_per_block = LDS_BYTE_LIMIT / bytes_per_batch;
+            while(threads_per_transform * transforms_per_block > workgroup_size)
+                --transforms_per_block;
+            if(!factors2d.empty())
+                transforms_per_block = std::min(transforms_per_block, length2d);
+
+            workgroup_size = threads_per_transform * transforms_per_block;
         }
 
-        transforms_per_block = LDS_BYTE_LIMIT / bytes_per_batch;
-        while(threads_per_transform * transforms_per_block > workgroup_size)
-            --transforms_per_block;
-        if(!factors2d.empty())
-            transforms_per_block = std::min(transforms_per_block, length2d);
-
-        workgroup_size = threads_per_transform * transforms_per_block;
-        nregisters     = compute_nregisters(length, factors, threads_per_transform);
-        R.size         = Expression{nregisters};
+        nregisters = compute_nregisters(length, factors, threads_per_transform);
+        R.size     = Expression{nregisters};
     }
     virtual ~StockhamKernel(){};
 
@@ -180,6 +189,16 @@
     // current transform index in a batch
     Variable transform{"transform", "size_t"};
 
+    // data index and offsets (for contiguous read/write)
+    Variable global_data_id{"global_data_id", "size_t"};
+    Variable global_load_data_offset{"global_load_data_offset", "size_t"};
+    Variable global_store_data_offset{"global_store_data_offset", "size_t"};
+
+    // transform index and offsets
+    Variable global_transf_id{"global_transf_id", "size_t"};
+    Variable global_load_transf_offset{"global_load_transf_offset", "size_t"};
+    Variable global_store_transf_offset{"global_store_transf_offset", "size_t"};
+
     // stride between consecutive indexes
     Variable stride0{"stride0", "const size_t"};
 
@@ -349,10 +368,10 @@
             switch(component)
             {
             case Component::REAL:
-                work += Assign(R[hr * width + w].x, lds_real[l_offset]);
+                work += Assign(R[hr * width + w].x(), lds_real[l_offset]);
                 break;
             case Component::IMAG:
-                work += Assign(R[hr * width + w].y, lds_real[l_offset]);
+                work += Assign(R[hr * width + w].y(), lds_real[l_offset]);
                 break;
             case Component::BOTH:
                 work += Assign(R[hr * width + w], lds_complex[l_offset]);
@@ -391,10 +410,10 @@
             switch(component)
             {
             case Component::REAL:
-                work += Assign(lds_real[l_offset], R[hr * width + w].x);
+                work += Assign(lds_real[l_offset], R[hr * width + w].x());
                 break;
             case Component::IMAG:
-                work += Assign(lds_real[l_offset], R[hr * width + w].y);
+                work += Assign(lds_real[l_offset], R[hr * width + w].y());
                 break;
             case Component::BOTH:
                 work += Assign(lds_complex[l_offset], R[hr * width + w]);
diff -Nru rocfft-5.5.0/library/src/device/generator/stockham_gen_cc.h rocfft-5.7.1/library/src/device/generator/stockham_gen_cc.h
--- rocfft-5.5.0/library/src/device/generator/stockham_gen_cc.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/stockham_gen_cc.h	2023-08-09 16:19:51.000000000 +0000
@@ -24,14 +24,19 @@
 struct StockhamKernelCC : public StockhamKernel
 {
     explicit StockhamKernelCC(const StockhamGeneratorSpecs& specs,
-                              bool                          largeTwdBatchIsTransformCount)
+                              bool                          largeTwdBatchIsTransformCount,
+                              bool                          emitGlobalId)
         : StockhamKernel(specs)
         , largeTwdBatchIsTransformCount(largeTwdBatchIsTransformCount)
+        , emitGlobalId(emitGlobalId)
     {
         large_twiddle_steps.decl_default = 3;
         large_twiddle_base.decl_default  = 8;
     }
 
+    bool largeTwdBatchIsTransformCount = false;
+    bool emitGlobalId                  = false;
+
     //
     // templates
     //
@@ -39,7 +44,6 @@
     Variable apply_large_twiddle{"apply_large_twiddle", "bool"};
     Variable large_twiddle_steps{"large_twiddle_steps", "size_t"};
     Variable large_twiddle_base{"large_twiddle_base", "size_t"};
-    bool     largeTwdBatchIsTransformCount = false;
 
     //
     // arguments
@@ -55,6 +59,9 @@
     Variable in_bound{"in_bound", "bool"};
     Variable thread{"thread", "unsigned int"}; // replacing tid_ver
     Variable tid_hor{"tid_hor", "unsigned int"}; // id along row
+    Variable stride_in{"stride_in", "const size_t", true};
+    Variable stride_out{"stride_out", "const size_t", true};
+    Variable length_M_blue{"length_M_blue", "const size_t"};
 
     // large twiddle support
     Multiply ltwd_entries{Parens{ShiftLeft{1, large_twiddle_base}}, 3};
@@ -101,10 +108,22 @@
         if(hr == 0)
             hr = h;
         StatementList load;
+
         for(unsigned int w = 0; w < width; ++w)
         {
             auto tid = Parens{thread + dt + h * threads_per_transform};
             auto idx = Parens{tid + w * length / width};
+
+            if(emitGlobalId)
+            {
+                load += Assign{global_data_id,
+                               global_load_data_offset + tid_hor
+                                   + Parens{Expression{idx}} * lengths[1]};
+                load += Assign{global_transf_id,
+                               global_load_transf_offset + tid_hor
+                                   + Parens{Expression{idx}} * lengths[1]};
+            }
+
             if(intrinsic)
             {
                 // no need to and with trivial "true"
@@ -145,6 +164,16 @@
             auto idx
                 = Parens{tid / cumheight} * (width * cumheight) + tid % cumheight + w * cumheight;
 
+            if(emitGlobalId)
+            {
+                work += Assign{global_data_id,
+                               global_store_data_offset + tid_hor
+                                   + Parens{Expression{idx}} * lengths[1]};
+                work += Assign{global_transf_id,
+                               global_store_transf_offset + tid_hor
+                                   + Parens{Expression{idx}} * lengths[1]};
+            }
+
             if(intrinsic)
             {
                 // no need to and with trivial "true"
@@ -172,6 +201,8 @@
         Variable index_along_d{"index_along_d", "size_t"};
         Variable remaining{"remaining", "size_t"};
         Variable plength{"plength", "size_t"};
+        Variable global_stride_in{"global_stride_in", "const size_t"};
+        Variable global_stride_out{"global_stride_out", "const size_t"};
 
         StatementList stmts;
         stmts += Declaration{tile_index};
@@ -184,20 +215,55 @@
         stmts += Declaration{plength, 1};
         stmts += Declaration{remaining};
         stmts += Declaration{index_along_d};
+        if(emitGlobalId)
+        {
+            stmts += Declaration{global_load_data_offset, 0};
+            stmts += Declaration{global_load_transf_offset, 0};
+            stmts += Declaration{global_store_data_offset, 0};
+            stmts += Declaration{global_store_transf_offset, 0};
+
+            stmts += Declaration{global_data_id, 0};
+            stmts += Declaration{global_transf_id, 0};
+        }
         stmts += Assign{num_of_tiles, (lengths[1] - 1) / transforms_per_block + 1};
         stmts += Assign{plength, num_of_tiles};
         stmts += Assign{tile_index, block_id % num_of_tiles};
         stmts += Assign{remaining, block_id / num_of_tiles};
         stmts += Assign{offset, tile_index * transforms_per_block * stride[1]};
 
-        stmts += For{d,
-                     2,
-                     d < dim,
-                     1,
-                     {Assign{plength, plength * lengths[d]},
-                      Assign{index_along_d, remaining % lengths[d]},
-                      Assign{remaining, remaining / lengths[d]},
-                      Assign{offset, offset + index_along_d * stride[d]}}};
+        if(emitGlobalId)
+        {
+            stmts += Declaration{Variable{"global_stride_in[3]", "const size_t"},
+                                 Literal{"{global_stride_in_0, global_stride_in_1, global_idist}"}};
+            stmts
+                += Declaration{Variable{"global_stride_out[3]", "const size_t"},
+                               Literal{"{global_stride_out_0, global_stride_out_1, global_odist}"}};
+
+            stmts += For{
+                d,
+                2,
+                d < dim,
+                1,
+                {Assign{plength, plength * lengths[d]},
+                 Assign{index_along_d, remaining % lengths[d]},
+                 Assign{remaining, remaining / lengths[d]},
+                 Assign{offset, offset + index_along_d * stride[d]},
+                 Assign{global_load_data_offset,
+                        global_load_data_offset + index_along_d * global_stride_in[d - 2]},
+                 Assign{global_store_data_offset,
+                        global_store_data_offset + index_along_d * global_stride_out[d - 2]}}};
+        }
+        else
+        {
+            stmts += For{d,
+                         2,
+                         d < dim,
+                         1,
+                         {Assign{plength, plength * lengths[d]},
+                          Assign{index_along_d, remaining % lengths[d]},
+                          Assign{remaining, remaining / lengths[d]},
+                          Assign{offset, offset + index_along_d * stride[d]}}};
+        }
 
         stmts += LineBreak{};
 
@@ -237,6 +303,20 @@
         stmts += Declaration{thread, thread_id / transforms_per_block};
         stmts += Declaration{tid_hor, thread_id % transforms_per_block};
 
+        if(emitGlobalId)
+        {
+            stmts += Assign{global_load_data_offset,
+                            global_load_data_offset + tile_index * transforms_per_block
+                                + batch * global_stride_in[2]};
+            stmts += Assign{global_load_transf_offset,
+                            global_load_transf_offset + tile_index * transforms_per_block};
+            stmts += Assign{global_store_data_offset,
+                            global_store_data_offset + tile_index * transforms_per_block
+                                + batch * global_stride_out[2]};
+            stmts += Assign{global_store_transf_offset,
+                            global_store_transf_offset + tile_index * transforms_per_block};
+        }
+
         return stmts;
     }
 
@@ -259,8 +339,20 @@
             };
 
             for(unsigned int i = 0; i < length / stripmine_h; ++i)
+            {
+                if(emitGlobalId)
+                {
+                    tmp_stmts += Assign{global_data_id,
+                                        global_load_data_offset + tid_hor
+                                            + (thread + i * stripmine_h) * lengths[1]};
+                    tmp_stmts += Assign{global_transf_id,
+                                        global_load_transf_offset + tid_hor
+                                            + (thread + i * stripmine_h) * lengths[1]};
+                }
+
                 tmp_stmts += Assign{lds_complex[offset_tile_wlds(i)],
                                     LoadGlobal{buf, offset + offset_tile_rbuf(i)}};
+            }
 
             stmts += CommentLines{
                 "no intrinsic when load to lds. FIXME- check why use nested branch is better"};
@@ -330,8 +422,20 @@
             };
 
             for(unsigned int i = 0; i < length / stripmine_h; ++i)
+            {
+                if(emitGlobalId)
+                {
+                    tmp_stmts += Assign{global_data_id,
+                                        global_store_data_offset + tid_hor
+                                            + (thread + i * stripmine_h) * lengths[1]};
+                    tmp_stmts += Assign{global_transf_id,
+                                        global_store_transf_offset + tid_hor
+                                            + (thread + i * stripmine_h) * lengths[1]};
+                }
+
                 tmp_stmts += StoreGlobal{
                     buf, offset + offset_tile_wbuf(i), lds_complex[offset_tile_rlds(i)]};
+            }
 
             stmts += CommentLines{
                 "no intrinsic when store from lds. FIXME- check why use nested branch is better"};
diff -Nru rocfft-5.5.0/library/src/device/generator/stockham_gen_cr.h rocfft-5.7.1/library/src/device/generator/stockham_gen_cr.h
--- rocfft-5.5.0/library/src/device/generator/stockham_gen_cr.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/stockham_gen_cr.h	2023-08-09 16:19:51.000000000 +0000
@@ -63,6 +63,12 @@
                     Declaration{lds_linear, Literal{"true"}}};
     }
 
+    StatementList set_lds_is_real() override
+    {
+        // SBCR can't support half-lds
+        return {Declaration{lds_is_real, Literal{"false"}}};
+    }
+
     StatementList load_global_generator(unsigned int h,
                                         unsigned int hr,
                                         unsigned int width,
diff -Nru rocfft-5.5.0/library/src/device/generator/stockham_gen_rc.h rocfft-5.7.1/library/src/device/generator/stockham_gen_rc.h
--- rocfft-5.5.0/library/src/device/generator/stockham_gen_rc.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator/stockham_gen_rc.h	2023-08-09 16:19:51.000000000 +0000
@@ -23,11 +23,14 @@
 
 struct StockhamKernelRC : public StockhamKernel
 {
-    explicit StockhamKernelRC(const StockhamGeneratorSpecs& specs)
+    explicit StockhamKernelRC(const StockhamGeneratorSpecs& specs, bool emitGlobalId)
         : StockhamKernel(specs)
+        , emitGlobalId(emitGlobalId)
     {
     }
 
+    bool emitGlobalId = false;
+
     //
     // templates
     //
@@ -89,7 +92,8 @@
 
     StatementList set_lds_is_real() override
     {
-        return {Declaration{lds_is_real, Literal{half_lds ? "true" : "false"}}};
+        // SBRC can't support half-lds
+        return {Declaration{lds_is_real, Literal{"false"}}};
     }
 
     StatementList store_global_generator(unsigned int h,
@@ -107,6 +111,17 @@
             auto tid = Parens{thread + dt + h * threads_per_transform};
             auto idx
                 = Parens{tid / cumheight} * (width * cumheight) + tid % cumheight + w * cumheight;
+
+            if(emitGlobalId)
+            {
+                work += Assign{global_data_id,
+                               global_store_data_offset + tid_hor
+                                   + Parens{Expression{idx}} * lengths[1]};
+                work += Assign{global_transf_id,
+                               global_store_transf_offset + tid_hor
+                                   + Parens{Expression{idx}} * lengths[1]};
+            }
+
             work += StoreGlobal{buf,
                                 offset + tid_hor * stride0
                                     + Parens{Expression{idx}} * stride_store_out,
@@ -129,6 +144,8 @@
 
         Variable plane_id{"plane_id", "unsigned int"};
         Variable tile_serial_in_batch{"tile_serial_in_batch", "unsigned int"};
+        Variable global_stride_in{"global_stride_in", "const size_t"};
+        Variable global_stride_out{"global_stride_out", "const size_t"};
 
         stmts += Declaration{
             len_along_block,
@@ -150,6 +167,17 @@
         stmts
             += Declaration{num_of_tiles_in_plane, (len_along_block - 1) / transforms_per_block + 1};
 
+        if(emitGlobalId)
+        {
+            stmts += Declaration{global_load_data_offset, 0};
+            stmts += Declaration{global_load_transf_offset, 0};
+            stmts += Declaration{global_store_data_offset, 0};
+            stmts += Declaration{global_store_transf_offset, 0};
+
+            stmts += Declaration{global_data_id, 0};
+            stmts += Declaration{global_transf_id, 0};
+        }
+
         // --------------------------------------------------
         // SBRC_2D
         // --------------------------------------------------
@@ -173,14 +201,40 @@
         offset_2d += Assign{offset_in, tile_index_in_plane * transforms_per_block * stride_load_in};
         offset_2d += Assign{offset_out, tile_index_in_plane * transforms_per_block * stride0_out};
 
-        offset_2d += For{d,
-                         2,
-                         d < dim,
-                         1,
-                         {Assign{num_of_tiles_in_batch, num_of_tiles_in_batch * lengths[d]},
-                          Assign{index_along_d, remaining % lengths[d]},
-                          Assign{remaining, remaining / lengths[d]},
-                          Assign{offset, offset + index_along_d * stride[d]}}};
+        if(emitGlobalId)
+        {
+            stmts += Declaration{Variable{"global_stride_in[3]", "const size_t"},
+                                 Literal{"{global_stride_in_0, global_stride_in_1, global_idist}"}};
+            stmts
+                += Declaration{Variable{"global_stride_out[3]", "const size_t"},
+                               Literal{"{global_stride_out_0, global_stride_out_1, global_odist}"}};
+
+            offset_2d += For{
+                d,
+                2,
+                d < dim,
+                1,
+                {Assign{num_of_tiles_in_batch, num_of_tiles_in_batch * lengths[d]},
+                 Assign{index_along_d, remaining % lengths[d]},
+                 Assign{remaining, remaining / lengths[d]},
+                 Assign{offset, offset + index_along_d * stride[d]},
+                 Assign{global_load_data_offset,
+                        global_load_data_offset + index_along_d * global_stride_in[d - 2]},
+                 Assign{global_store_data_offset,
+                        global_store_data_offset + index_along_d * global_stride_out[d - 2]}}};
+        }
+        else
+        {
+            offset_2d += For{d,
+                             2,
+                             d < dim,
+                             1,
+                             {Assign{num_of_tiles_in_batch, num_of_tiles_in_batch * lengths[d]},
+                              Assign{index_along_d, remaining % lengths[d]},
+                              Assign{remaining, remaining / lengths[d]},
+                              Assign{offset, offset + index_along_d * stride[d]}}};
+        }
+
         //
         // --------------------------------------------------
         // SBRC_3D
@@ -303,6 +357,23 @@
                                     thread_id % transforms_per_block}};
         }
 
+        if(emitGlobalId)
+        {
+            stmts += Assign{global_load_data_offset,
+                            global_load_data_offset
+                                + tile_index_in_plane * transforms_per_block * length
+                                + batch * global_stride_in[2]};
+            stmts += Assign{global_load_transf_offset,
+                            global_load_transf_offset
+                                + tile_index_in_plane * transforms_per_block * length};
+            stmts += Assign{global_store_data_offset,
+                            global_store_data_offset + tile_index_in_plane * transforms_per_block
+                                + batch * global_stride_out[2]};
+            stmts
+                += Assign{global_store_transf_offset,
+                          global_store_transf_offset + tile_index_in_plane * transforms_per_block};
+        }
+
         stmts += Declaration{edge, "false"};
         stmts += Declaration{thread};
         stmts += Declaration{tid_hor};
@@ -364,10 +435,27 @@
                     return ((thread_id + i * workgroup_size) % length) * stride_lds
                            + ((thread_id + i * workgroup_size) / length) * 1;
             };
+            auto offset_tile_tmp = [&](unsigned int i) {
+                if(divisible)
+                    return tid_hor + (thread + i * tid0_inc_step) * length;
+
+                else
+                    return ((thread_id + i * workgroup_size) % length)
+                           + ((thread_id + i * workgroup_size) / length) * length;
+            };
 
             StatementList regular_load;
+
             for(unsigned int i = 0; i < num_load_blocks; ++i)
             {
+                Expression tmp_idx = offset_tile_tmp(i);
+
+                if(emitGlobalId)
+                {
+                    regular_load += Assign{global_data_id, global_load_data_offset + tmp_idx};
+                    regular_load += Assign{global_transf_id, global_load_transf_offset + tmp_idx};
+                }
+
                 Expression buf_idx = offset_tile_rbuf(i);
                 Expression lds_idx = offset_tile_wlds(i);
                 if(direct_to_from_reg)
@@ -376,9 +464,11 @@
             }
 
             StatementList edge_load;
-            Variable      t{"t", "unsigned int"};
+
+            Variable t{"t", "unsigned int"};
             if(divisible)
             {
+                Expression tmp_idx = tid_hor + (thread + t) * length;
                 Expression buf_idx = tid_hor * stride0 + (thread + t) * stride_load_in;
                 Expression lds_idx = tid_hor * 1 + (thread + t) * stride_lds;
                 Expression pred
@@ -389,15 +479,32 @@
                                       tid_hor * 1 + (thread + t) * stride_lds,
                                       tid_hor * stride_lds + (thread + t) * 1};
                 }
-                edge_load
-                    += For{t,
-                           0,
-                           pred,
-                           tid0_inc_step,
-                           {Assign{lds_complex[lds_idx], LoadGlobal{buf, offset_in + buf_idx}}}};
+
+                if(emitGlobalId)
+                {
+                    edge_load += For{
+                        t,
+                        0,
+                        pred,
+                        tid0_inc_step,
+                        {Assign{global_data_id, global_load_data_offset + tmp_idx},
+                         Assign{global_transf_id, global_load_transf_offset + tmp_idx},
+                         Assign{lds_complex[lds_idx], LoadGlobal{buf, offset_in + buf_idx}}}};
+                }
+                else
+                {
+                    edge_load += For{
+                        t,
+                        0,
+                        pred,
+                        tid0_inc_step,
+                        {Assign{lds_complex[lds_idx], LoadGlobal{buf, offset_in + buf_idx}}}};
+                }
             }
             else
             {
+                Expression tmp_idx
+                    = ((thread_id + t) % length) + ((thread_id + t) / length) * length;
                 Expression buf_idx = ((thread_id + t) % length) * stride0
                                      + ((thread_id + t) / length) * stride_load_in;
                 Expression lds_idx
@@ -408,12 +515,27 @@
                         lds_linear,
                         ((thread_id + t) % length) * 1 + ((thread_id + t) / length) * stride_lds,
                         ((thread_id + t) % length) * stride_lds + ((thread_id + t) / length) * 1};
-                edge_load
-                    += For{t,
-                           0,
-                           pred,
-                           workgroup_size,
-                           {Assign{lds_complex[lds_idx], LoadGlobal{buf, offset_in + buf_idx}}}};
+
+                if(emitGlobalId)
+                {
+                    edge_load += For{
+                        t,
+                        0,
+                        pred,
+                        workgroup_size,
+                        {Assign{global_data_id, global_load_data_offset + tmp_idx},
+                         Assign{global_transf_id, global_load_transf_offset + tmp_idx},
+                         Assign{lds_complex[lds_idx], LoadGlobal{buf, offset_in + buf_idx}}}};
+                }
+                else
+                {
+                    edge_load += For{
+                        t,
+                        0,
+                        pred,
+                        workgroup_size,
+                        {Assign{lds_complex[lds_idx], LoadGlobal{buf, offset_in + buf_idx}}}};
+                }
             }
 
             stmts += If{Or{transpose_type != "TILE_UNALIGNED", Not{edge}}, regular_load};
@@ -476,10 +598,27 @@
                     return ((thread_id + i * workgroup_size) % store_block_w) * stride_lds
                            + ((thread_id + i * workgroup_size) / store_block_w) * 1;
             };
+            auto offset_tile_tmp = [&](unsigned int i) {
+                if(divisible)
+                    return tid_hor + (thread + i * tid0_inc_step) * lengths[1];
+                else
+                    return ((thread_id + i * workgroup_size) % store_block_w)
+                           + (thread + i * tid0_inc_step) * lengths[1];
+            };
 
             for(unsigned int i = 0; i < num_store_blocks; ++i)
+            {
+                if(emitGlobalId)
+                {
+                    regular_store
+                        += Assign{global_data_id, global_store_data_offset + offset_tile_tmp(i)};
+                    regular_store += Assign{global_transf_id,
+                                            global_store_transf_offset + offset_tile_tmp(i)};
+                }
+
                 regular_store += StoreGlobal{
                     buf, offset + offset_tile_wbuf(i), lds_complex[offset_tile_rlds(i)]};
+            };
 
             // ERC_Z_XY
             auto          i = num_store_blocks;
diff -Nru rocfft-5.5.0/library/src/device/generator.py rocfft-5.7.1/library/src/device/generator.py
--- rocfft-5.5.0/library/src/device/generator.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/generator.py	2023-08-09 16:19:51.000000000 +0000
@@ -792,14 +792,21 @@
     def emplace(self, key, value):
         return Call(self.name + '.emplace', arguments=ArgumentList(key, value))
 
-    def assert_emplace(self, key, value):
+    def assert_emplace(self, key, value, what_error):
         emplace = Call(self.name + '.emplace',
                        arguments=ArgumentList(key, value)).inline()
         status = Call(name='std::get<1>',
                       arguments=ArgumentList(emplace)).inline()
-        throw = StatementList(Throw('std::runtime_error("' + str(key) + '")'))
+        throw = StatementList(
+            Throw('std::runtime_error("' + str(what_error) + '")'))
         return If(Equal(status, "false"), throw)
 
+    def assert_insert(self, key, value):
+        insert = Call('insert_default_entry',
+                      arguments=ArgumentList(key, value)).inline()
+        throw = StatementList(Throw('std::runtime_error("' + str(key) + '")'))
+        return If(Equal(insert, "false"), throw)
+
     # def __getitem__(self, idx):
     #     return ArrayElement(self.name, idx)
 
@@ -879,20 +886,6 @@
         return f'using {self.name} = {self.spec};'
 
 
-@name_args(['name', 'arguments', 'templates', 'qualifier'])
-class Prototype(BaseNode):
-
-    def __str__(self) -> str:
-        f = ''
-        if self.templates:
-            f += 'template<' + str(self.templates) + '>'
-        if self.qualifier is not None:
-            f += self.qualifier + ' '
-        f += ' void ' + self.name
-        f += '(' + str(self.arguments) + ') FUNCTION_POOL_STANDALONE_BODY ;'
-        return f
-
-
 @name_args([
     'name', 'value', 'arguments', 'templates', 'qualifier', 'launch_bounds',
     'body', 'meta'
@@ -916,10 +909,6 @@
         f += '{' + njoin(self.body) + '}'
         return f
 
-    def prototype(self):
-        return Prototype(self.name, self.arguments, self.templates,
-                         self.qualifier)
-
     def address(self):
         return Address(self.name)
 
diff -Nru rocfft-5.5.0/library/src/device/kernel-generator-embed.h rocfft-5.7.1/library/src/device/kernel-generator-embed.h
--- rocfft-5.5.0/library/src/device/kernel-generator-embed.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/kernel-generator-embed.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@
 #include <unordered_map>
 #include <vector>
 
+extern const char* rocfft_complex_h;
 extern const char* common_h;
 extern const char* memory_gfx_h;
 extern const char* callback_h;
@@ -45,7 +46,6 @@
 extern const char* radix_13_h;
 extern const char* radix_16_h;
 extern const char* radix_17_h;
-extern const char* rtc_workarounds_h;
 
 const std::array<char, 32> generator_sum();
 
diff -Nru rocfft-5.5.0/library/src/device/kernel-generator.py rocfft-5.7.1/library/src/device/kernel-generator.py
--- rocfft-5.5.0/library/src/device/kernel-generator.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/kernel-generator.py	2023-08-09 16:19:51.000000000 +0000
@@ -43,7 +43,7 @@
 
 from generator import (ArgumentList, BaseNode, Call, CommentBlock, Function,
                        Include, LineBreak, Map, StatementList, Variable,
-                       name_args, write, clang_format_file)
+                       Assign, name_args, write)
 
 from collections import namedtuple
 
@@ -81,13 +81,6 @@
 #
 # Helpers
 #
-
-
-def flatten(lst):
-    """Flatten a list of lists to a list."""
-    return sum(lst, [])
-
-
 def unique(kernels):
     """Merge kernel lists without duplicated meta.length; ignore later ones."""
     r, s = list(), set()
@@ -123,8 +116,10 @@
             f += str(self.function.address())
         use_3steps_large_twd = getattr(self.function.meta,
                                        'use_3steps_large_twd', None)
+        # assume half-precision needs the same thing as single
+        precision = 'sp' if self.function.meta.precision == 'half' else self.function.meta.precision
         if use_3steps_large_twd is not None:
-            f += ', ' + str(use_3steps_large_twd[self.function.meta.precision])
+            f += ', ' + str(use_3steps_large_twd[precision])
         else:
             f += ', false'
         factors = getattr(self.function.meta, 'factors', None)
@@ -164,24 +159,27 @@
     function_map = Map('function_map')
     precisions = {
         'sp': 'rocfft_precision_single',
-        'dp': 'rocfft_precision_double'
+        'dp': 'rocfft_precision_double',
+        'half': 'rocfft_precision_half',
     }
+    var_kernel = Variable('kernel', 'FFTKernel')
 
     populate = StatementList()
+    populate += var_kernel.declaration()
     for f in functions:
         length, precision, scheme, transpose = f.meta.length, f.meta.precision, f.meta.scheme, f.meta.transpose
         if isinstance(length, (int, str)):
             length = [length, 0]
+        populate += Assign(var_kernel, FFTKernel(f))
         key = Call(name='std::make_tuple',
                    arguments=ArgumentList(
                        'std::array<size_t, 2>({' + cjoin(length) + '})',
-                       precisions[precision], scheme, transpose
-                       or 'NONE')).inline()
-        populate += function_map.assert_emplace(key, FFTKernel(f))
+                       precisions[precision], scheme, transpose or 'NONE',
+                       'kernel.get_kernel_config()')).inline()
+        populate += function_map.assert_insert(key, var_kernel)
 
     return StatementList(
         Include('"../include/function_pool.h"'),
-        StatementList(*[f.prototype() for f in functions]),
         Function(name='function_pool::function_pool',
                  value=False,
                  arguments=ArgumentList(),
@@ -642,7 +640,9 @@
         if not hasattr(k, 'length'):
             k.length = functools.reduce(lambda a, b: a * b, k.factors)
 
-    # SBRC
+    # for SBRC, if direct_to_from_reg is True, we do store-from-reg, but will not do load-to-reg
+    #           And since SBRC is dir-from-reg but NOT dir-to-reg, the global load part requires full LDS
+    #           So, SBRC is not able to use half-lds.
     sbrc_kernels = [
         NS(length=17,  factors=[17], scheme='CS_KERNEL_STOCKHAM_BLOCK_RC', workgroup_size=256, threads_per_transform=1, runtime_compile=True),
         NS(length=49,  factors=[7, 7], scheme='CS_KERNEL_STOCKHAM_BLOCK_RC', workgroup_size=196, threads_per_transform=7), # block_width=28
@@ -668,6 +668,9 @@
         NS(length=1331, factors=[11, 11, 11], scheme='CS_KERNEL_STOCKHAM_BLOCK_RC', workgroup_size=256, threads_per_transform=121, runtime_compile=True),
     ]
 
+    for k in sbrc_kernels:
+        k.half_lds = False
+
     # NB:
     # Technically, we could have SBCR kernels the same amount as SBCC.
     #
@@ -677,8 +680,8 @@
     #
 
     # for SBCR, if direct_to_from_reg is True, we do load-to-reg, but will not do store-from-reg
-    # TODO- tune on ROCm 5.2, SBCR 56 shows no improvement while it has on ROCm 5.0
-    #       Need to be aware of any compiler optimization / regression
+    #           And since sbcr is dir-to-reg BUT NOT dir-from-reg, the global store part requires full LDS
+    #           So, we can't satisfy half_lds in SBCR!
     sbcr_kernels = [
         NS(length=56,  factors=[7, 8], direct_to_from_reg=False),
         NS(length=100, factors=[10, 10], workgroup_size=100),
@@ -689,11 +692,10 @@
     block_width = 16
     for k in sbcr_kernels:
         k.scheme = 'CS_KERNEL_STOCKHAM_BLOCK_CR'
+        k.half_lds = False
         if not hasattr(k, 'workgroup_size'):
             k.workgroup_size = block_width * \
                 functools.reduce(mul, k.factors, 1) // min(k.factors)
-        if hasattr(k, 'half_lds') and k.half_lds is True:
-            k.workgroup_size = min(1024, k.workgroup_size * 2)
         if not hasattr(k, 'length'):
             k.length = functools.reduce(lambda a, b: a * b, k.factors)
 
@@ -744,6 +746,10 @@
     # default half_lds to True only for CS_KERNEL_STOCKHAM
     half_lds = getattr(kernel, 'half_lds',
                        kernel.scheme == 'CS_KERNEL_STOCKHAM')
+    # but we don't use LDS for single-radix kernels, so half_lds is meaningless there
+    if len(kernel.factors) == 1:
+        half_lds = False
+
     # for unspecified direct_to_from_reg, default is True only for CS_KERNEL_STOCKHAM and SBCC
     direct_to_from_reg = getattr(kernel, 'direct_to_from_reg', True)
 
@@ -755,16 +761,15 @@
     args.append(kernel.scheme)
     args.append(filename)
 
-    proc = subprocess.Popen(args=args, stderr=subprocess.PIPE)
+    proc = subprocess.Popen(args=args)
     ret_code = proc.wait()
     if (ret_code != 0):
-        print(proc.stderr.read().decode('ascii'))
         sys.exit(f"Error executing " + stockham_aot)
 
     kernel_metadata_file = open(kernel_file_name(kernel) + '.json', 'r')
     launchers = json.load(kernel_metadata_file)
 
-    clang_format_file(filename)
+    # don't format generated source files since they aren't currently used
 
     cpu_functions = []
     data = Variable('data_p', 'const void *')
@@ -797,23 +802,27 @@
             threads_per_transform, 0
         ]
 
-        f = Function(name=launcher.name,
-                     arguments=ArgumentList(data, back),
-                     meta=NS(
-                         factors=factors,
-                         length=length,
-                         params=params,
-                         precision=precision,
-                         runtime_compile=runtime_compile,
-                         scheme=scheme,
-                         workgroup_size=workgroup_size,
-                         transforms_per_block=transforms_per_block,
-                         threads_per_transform=tpt_list,
-                         transpose=sbrc_transpose_type,
-                         use_3steps_large_twd=use_3steps_large_twd,
-                     ))
+        precisions = [precision]
+        if precision == 'sp':
+            precisions.append('half')
+        for p in precisions:
+            f = Function(name=launcher.name,
+                         arguments=ArgumentList(data, back),
+                         meta=NS(
+                             factors=factors,
+                             length=length,
+                             params=params,
+                             precision=p,
+                             runtime_compile=runtime_compile,
+                             scheme=scheme,
+                             workgroup_size=workgroup_size,
+                             transforms_per_block=transforms_per_block,
+                             threads_per_transform=tpt_list,
+                             transpose=sbrc_transpose_type,
+                             use_3steps_large_twd=use_3steps_large_twd,
+                         ))
 
-        cpu_functions.append(f)
+            cpu_functions.append(f)
 
     return cpu_functions
 
@@ -826,52 +835,12 @@
 
     A list of CPU functions is returned.
     """
-    import concurrent.futures
-    import queue
 
-    # push all the work to a queue
-    q_in = queue.Queue()
+    ret = []
     for k in kernels:
-        q_in.put(k)
-
-    # queue for outputs
-    q_out = queue.Queue()
-
-    def threadfunc():
-        nonlocal q_in
-        nonlocal q_out
-        nonlocal precisions
-        nonlocal stockham_aot
-        try:
-            while not q_in.empty():
-                k = q_in.get()
-                q_out.put(generate_kernel(k, precisions, stockham_aot))
-        except queue.Empty:
-            pass
-
-    # by default, start up worker threads.  disable this if you want
-    # to use pdb to debug
-    use_threads = True
-
-    if use_threads:
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            results = []
-            for i in range(os.cpu_count()):
-                results.append(executor.submit(threadfunc))
-            for result in results:
-                result.result()
-    else:
-        threadfunc()
-
-    # iterate over the queue
-    def queue_iter(q_out):
-        try:
-            while not q_out.empty():
-                yield q_out.get()
-        except queue.Empty:
-            pass
+        ret += generate_kernel(k, precisions, stockham_aot)
 
-    return flatten(queue_iter(q_out))
+    return ret
 
 
 def cli():
diff -Nru rocfft-5.5.0/library/src/device/kernels/array_format.h rocfft-5.7.1/library/src/device/kernels/array_format.h
--- rocfft-5.5.0/library/src/device/kernels/array_format.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/kernels/array_format.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -69,57 +69,72 @@
 };
 
 template <CallbackType cbtype>
-struct Handler<interleaved<float2>, cbtype>
+struct Handler<interleaved<rocfft_complex<float>>, cbtype>
 {
-    static __host__ __device__ inline float2
-        read(const interleaved<float2> in, size_t idx, void* load_cb_fn, void* load_cb_data)
+    static __host__ __device__ inline rocfft_complex<float>
+                    read(const interleaved<rocfft_complex<float>> in,
+                         size_t                                   idx,
+                         void*                                    load_cb_fn,
+                         void*                                    load_cb_data)
     {
-        auto load_cb = get_load_cb<float2, cbtype>(load_cb_fn);
+        auto load_cb = get_load_cb<rocfft_complex<float>, cbtype>(load_cb_fn);
         // callback might modify input, but it's otherwise const
-        return load_cb(const_cast<float2*>(in.C), idx, load_cb_data, nullptr);
+        return load_cb(const_cast<rocfft_complex<float>*>(in.C), idx, load_cb_data, nullptr);
     }
 
-    static __host__ __device__ inline void
-        write(interleaved<float2> out, size_t idx, float2 v, void* store_cb_fn, void* store_cb_data)
+    static __host__ __device__ inline void write(interleaved<rocfft_complex<float>> out,
+                                                 size_t                             idx,
+                                                 rocfft_complex<float>              v,
+                                                 void*                              store_cb_fn,
+                                                 void*                              store_cb_data)
     {
-        auto store_cb = get_store_cb<float2, cbtype>(store_cb_fn);
+        auto store_cb = get_store_cb<rocfft_complex<float>, cbtype>(store_cb_fn);
         store_cb(out.C, idx, v, store_cb_data, nullptr);
     }
 };
 
 template <CallbackType cbtype>
-struct Handler<interleaved<double2>, cbtype>
+struct Handler<interleaved<rocfft_complex<double>>, cbtype>
 {
-    static __host__ __device__ inline double2
-        read(const interleaved<double2> in, size_t idx, void* load_cb_fn, void* load_cb_data)
+    static __host__ __device__ inline rocfft_complex<double>
+                    read(const interleaved<rocfft_complex<double>> in,
+                         size_t                                    idx,
+                         void*                                     load_cb_fn,
+                         void*                                     load_cb_data)
     {
-        auto load_cb = get_load_cb<double2, cbtype>(load_cb_fn);
+        auto load_cb = get_load_cb<rocfft_complex<double>, cbtype>(load_cb_fn);
         // callback might modify input, but it's otherwise const
-        return load_cb(const_cast<double2*>(in.C), idx, load_cb_data, nullptr);
+        return load_cb(const_cast<rocfft_complex<double>*>(in.C), idx, load_cb_data, nullptr);
     }
 
-    static __host__ __device__ inline void write(
-        interleaved<double2> out, size_t idx, double2 v, void* store_cb_fn, void* store_cb_data)
+    static __host__ __device__ inline void write(interleaved<rocfft_complex<double>> out,
+                                                 size_t                              idx,
+                                                 rocfft_complex<double>              v,
+                                                 void*                               store_cb_fn,
+                                                 void*                               store_cb_data)
     {
-        auto store_cb = get_store_cb<double2, cbtype>(store_cb_fn);
+        auto store_cb = get_store_cb<rocfft_complex<double>, cbtype>(store_cb_fn);
         store_cb(out.C, idx, v, store_cb_data, nullptr);
     }
 };
 
 template <CallbackType cbtype>
-struct Handler<planar<float2>, cbtype>
+struct Handler<planar<rocfft_complex<float>>, cbtype>
 {
-    static __host__ __device__ inline float2
-        read(const planar<float2> in, size_t idx, void* load_cb_fn, void* load_cb_data)
+    static __host__ __device__ inline rocfft_complex<float> read(
+        const planar<rocfft_complex<float>> in, size_t idx, void* load_cb_fn, void* load_cb_data)
     {
-        float2 t;
+        rocfft_complex<float> t;
         t.x = in.R[idx];
         t.y = in.I[idx];
         return t;
     }
 
-    static __host__ __device__ inline void
-        write(planar<float2> out, size_t idx, float2 v, void* store_cb_fn, void* store_cb_data)
+    static __host__ __device__ inline void write(planar<rocfft_complex<float>> out,
+                                                 size_t                        idx,
+                                                 rocfft_complex<float>         v,
+                                                 void*                         store_cb_fn,
+                                                 void*                         store_cb_data)
     {
         out.R[idx] = v.x;
         out.I[idx] = v.y;
@@ -127,19 +142,22 @@
 };
 
 template <CallbackType cbtype>
-struct Handler<planar<double2>, cbtype>
+struct Handler<planar<rocfft_complex<double>>, cbtype>
 {
-    static __host__ __device__ inline double2
-        read(const planar<double2> in, size_t idx, void* load_cb_fn, void* load_cb_data)
+    static __host__ __device__ inline rocfft_complex<double> read(
+        const planar<rocfft_complex<double>> in, size_t idx, void* load_cb_fn, void* load_cb_data)
     {
-        double2 t;
+        rocfft_complex<double> t;
         t.x = in.R[idx];
         t.y = in.I[idx];
         return t;
     }
 
-    static __host__ __device__ inline void
-        write(planar<double2> out, size_t idx, double2 v, void* store_cb_fn, void* store_cb_data)
+    static __host__ __device__ inline void write(planar<rocfft_complex<double>> out,
+                                                 size_t                         idx,
+                                                 rocfft_complex<double>         v,
+                                                 void*                          store_cb_fn,
+                                                 void*                          store_cb_data)
     {
         out.R[idx] = v.x;
         out.I[idx] = v.y;
diff -Nru rocfft-5.5.0/library/src/device/kernels/callback.h rocfft-5.7.1/library/src/device/kernels/callback.h
--- rocfft-5.5.0/library/src/device/kernels/callback.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/kernels/callback.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
 #ifndef ROCFFT_DEVICE_CALLBACK_H
 #define ROCFFT_DEVICE_CALLBACK_H
 
+#include "../../../../shared/rocfft_complex.h"
 #include <hip/hip_vector_types.h>
 
 #include "memory_gfx.h"
@@ -55,47 +56,87 @@
 struct callback_type;
 
 template <>
-struct callback_type<float>
+struct callback_type<rocfft_complex<_Float16>>
 {
-    typedef float (*load)(float* data, size_t offset, void* cbdata, void* sharedMem);
-    typedef void (*store)(float* data, size_t offset, float element, void* cbdata, void* sharedMem);
+    typedef rocfft_complex<_Float16> (*load)(rocfft_complex<_Float16>* data,
+                                             size_t                    offset,
+                                             void*                     cbdata,
+                                             void*                     sharedMem);
+    typedef void (*store)(rocfft_complex<_Float16>* data,
+                          size_t                    offset,
+                          rocfft_complex<_Float16>  element,
+                          void*                     cbdata,
+                          void*                     sharedMem);
 };
 
-static __device__ auto load_cb_default_float  = load_cb_default<float>;
-static __device__ auto store_cb_default_float = store_cb_default<float>;
+static __device__ auto load_cb_default_complex_half  = load_cb_default<rocfft_complex<_Float16>>;
+static __device__ auto store_cb_default_complex_half = store_cb_default<rocfft_complex<_Float16>>;
 
 template <>
-struct callback_type<float2>
+struct callback_type<rocfft_complex<float>>
 {
-    typedef float2 (*load)(float2* data, size_t offset, void* cbdata, void* sharedMem);
-    typedef void (*store)(
-        float2* data, size_t offset, float2 element, void* cbdata, void* sharedMem);
+    typedef rocfft_complex<float> (*load)(rocfft_complex<float>* data,
+                                          size_t                 offset,
+                                          void*                  cbdata,
+                                          void*                  sharedMem);
+    typedef void (*store)(rocfft_complex<float>* data,
+                          size_t                 offset,
+                          rocfft_complex<float>  element,
+                          void*                  cbdata,
+                          void*                  sharedMem);
 };
 
-static __device__ auto load_cb_default_float2  = load_cb_default<float2>;
-static __device__ auto store_cb_default_float2 = store_cb_default<float2>;
+static __device__ auto load_cb_default_complex_float  = load_cb_default<rocfft_complex<float>>;
+static __device__ auto store_cb_default_complex_float = store_cb_default<rocfft_complex<float>>;
 
 template <>
-struct callback_type<double>
+struct callback_type<rocfft_complex<double>>
 {
-    typedef double (*load)(double* data, size_t offset, void* cbdata, void* sharedMem);
+    typedef rocfft_complex<double> (*load)(rocfft_complex<double>* data,
+                                           size_t                  offset,
+                                           void*                   cbdata,
+                                           void*                   sharedMem);
+    typedef void (*store)(rocfft_complex<double>* data,
+                          size_t                  offset,
+                          rocfft_complex<double>  element,
+                          void*                   cbdata,
+                          void*                   sharedMem);
+};
+
+static __device__ auto load_cb_default_complex_double  = load_cb_default<rocfft_complex<double>>;
+static __device__ auto store_cb_default_complex_double = store_cb_default<rocfft_complex<double>>;
+
+template <>
+struct callback_type<_Float16>
+{
+    typedef _Float16 (*load)(_Float16* data, size_t offset, void* cbdata, void* sharedMem);
     typedef void (*store)(
-        double* data, size_t offset, double element, void* cbdata, void* sharedMem);
+        _Float16* data, size_t offset, _Float16 element, void* cbdata, void* sharedMem);
 };
 
-static __device__ auto load_cb_default_double  = load_cb_default<double>;
-static __device__ auto store_cb_default_double = store_cb_default<double>;
+static __device__ auto load_cb_default_half  = load_cb_default<_Float16>;
+static __device__ auto store_cb_default_half = store_cb_default<_Float16>;
 
 template <>
-struct callback_type<double2>
+struct callback_type<float>
 {
-    typedef double2 (*load)(double2* data, size_t offset, void* cbdata, void* sharedMem);
+    typedef float (*load)(float* data, size_t offset, void* cbdata, void* sharedMem);
+    typedef void (*store)(float* data, size_t offset, float element, void* cbdata, void* sharedMem);
+};
+
+static __device__ auto load_cb_default_float  = load_cb_default<float>;
+static __device__ auto store_cb_default_float = store_cb_default<float>;
+
+template <>
+struct callback_type<double>
+{
+    typedef double (*load)(double* data, size_t offset, void* cbdata, void* sharedMem);
     typedef void (*store)(
-        double2* data, size_t offset, double2 element, void* cbdata, void* sharedMem);
+        double* data, size_t offset, double element, void* cbdata, void* sharedMem);
 };
 
-static __device__ auto load_cb_default_double2  = load_cb_default<double2>;
-static __device__ auto store_cb_default_double2 = store_cb_default<double2>;
+static __device__ auto load_cb_default_double  = load_cb_default<double>;
+static __device__ auto store_cb_default_double = store_cb_default<double>;
 
 // intrinsic
 template <typename T>
diff -Nru rocfft-5.5.0/library/src/device/kernels/common.h rocfft-5.7.1/library/src/device/kernels/common.h
--- rocfft-5.5.0/library/src/device/kernels/common.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/kernels/common.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -41,58 +41,64 @@
 #include "vector_types.h"
 #include <cuComplex.h>
 
-__device__ inline float2 operator-(const float2& a, const float2& b)
+__device__ inline rocfft_complex<float> operator-(const rocfft_complex<float>& a,
+                                                  const rocfft_complex<float>& b)
 {
-    return make_float2(a.x - b.x, a.y - b.y);
+    return rocfft_complex<float>(a.x - b.x, a.y - b.y);
 }
-__device__ inline float2 operator+(const float2& a, const float2& b)
+__device__ inline rocfft_complex<float> operator+(const rocfft_complex<float>& a,
+                                                  const rocfft_complex<float>& b)
 {
-    return make_float2(a.x + b.x, a.y + b.y);
+    return rocfft_complex<float>(a.x + b.x, a.y + b.y);
 }
-__device__ inline float2 operator*(const float& a, const float2& b)
+__device__ inline rocfft_complex<float> operator*(const float& a, const rocfft_complex<float>& b)
 {
-    return make_float2(a * b.x, a * b.y);
+    return rocfft_complex<float>(a * b.x, a * b.y);
 }
-__device__ inline float2 operator*=(float2& a, const float2& b)
+__device__ inline rocfft_complex<float> operator*=(rocfft_complex<float>&       a,
+                                                   const rocfft_complex<float>& b)
 {
     a = cuCmulf(a, b);
     return a;
 }
-__device__ inline float2 operator*=(float2& a, const float& b)
+__device__ inline rocfft_complex<float> operator*=(rocfft_complex<float>& a, const float& b)
 {
-    a = cuCmulf(a, make_float2(b, b));
+    a = rocfft_complex<float>(a.x * b, a.y * b); // scale both parts by the real scalar b, i.e. by (b, 0) and not (b, b)
     return a;
 }
-__device__ inline float2 operator-(const float2& a)
+__device__ inline rocfft_complex<float> operator-(const rocfft_complex<float>& a)
 {
-    return cuCmulf(a, make_float2(-1.0, -1.0));
+    return rocfft_complex<float>(-a.x, -a.y); // negate each part; a complex product with (-1, -1) is not -a
 }
 
-__device__ inline double2 operator-(const double2& a, const double2& b)
+__device__ inline rocfft_complex<double> operator-(const rocfft_complex<double>& a,
+                                                   const rocfft_complex<double>& b)
 {
-    return make_double2(a.x - b.x, a.y - b.y);
+    return rocfft_complex<double>(a.x - b.x, a.y - b.y);
 }
-__device__ inline double2 operator+(const double2& a, const double2& b)
+__device__ inline rocfft_complex<double> operator+(const rocfft_complex<double>& a,
+                                                   const rocfft_complex<double>& b)
 {
-    return make_double2(a.x + b.x, a.y + b.y);
+    return rocfft_complex<double>(a.x + b.x, a.y + b.y);
 }
-__device__ inline double2 operator*(const double& a, const double2& b)
+__device__ inline rocfft_complex<double> operator*(const double& a, const rocfft_complex<double>& b)
 {
-    return make_double2(a * b.x, a * b.y);
+    return rocfft_complex<double>(a * b.x, a * b.y);
 }
-__device__ inline double2 operator*=(double2& a, const double2& b)
+__device__ inline rocfft_complex<double> operator*=(rocfft_complex<double>&       a,
+                                                    const rocfft_complex<double>& b)
 {
     a = cuCmul(a, b);
     return a;
 }
-__device__ inline double2 operator*=(double2& a, const double& b)
+__device__ inline rocfft_complex<double> operator*=(rocfft_complex<double>& a, const double& b)
 {
-    a = cuCmul(a, make_double2(b, b));
+    a = rocfft_complex<double>(a.x * b, a.y * b); // scale both parts by the real scalar b, i.e. by (b, 0) and not (b, b)
     return a;
 }
-__device__ inline double2 operator-(const double2& a)
+__device__ inline rocfft_complex<double> operator-(const rocfft_complex<double>& a)
 {
-    return cuCmul(a, make_double2(-1.0, -1.0));
+    return rocfft_complex<double>(-a.x, -a.y); // negate each part; a complex product with (-1, -1) is not -a
 }
 
 #endif
@@ -103,11 +109,11 @@
     SB_NONUNIT,
 };
 
-enum class EmbeddedType
+enum class EmbeddedType : int
 {
-    NONE, // Works as the regular complex to complex FFT kernel
-    Real2C_POST, // Works with even-length real2complex post-processing
-    C2Real_PRE, // Works with even-length complex2real pre-processing
+    NONE        = 0, // Works as the regular complex to complex FFT kernel
+    Real2C_POST = 1, // Works with even-length real2complex post-processing
+    C2Real_PRE  = 2, // Works with even-length complex2real pre-processing
 };
 
 // TODO: rework this
@@ -135,11 +141,9 @@
 
 enum SBRC_TRANSPOSE_TYPE
 {
-    NONE,
-    // best, but requires cube sizes
-    DIAGONAL,
-    // OK, doesn't require handling unaligned corner case
-    TILE_ALIGNED,
+    NONE, // indicating this is a non-sbrc type, an SBRC kernel shouldn't have this
+    DIAGONAL, // best, but requires cube sizes
+    TILE_ALIGNED, // OK, doesn't require handling unaligned corner case
     TILE_UNALIGNED,
 };
 
@@ -158,53 +162,59 @@
     ENABLE_BOTH, // turn-on both intrinsic buffer load/store
 };
 
+enum BluesteinType
+{
+    BT_NONE,
+    BT_SINGLE_KERNEL, // implementation for small lengths (that fit in LDS)
+    BT_MULTI_KERNEL, // large lengths
+    BT_MULTI_KERNEL_FUSED, // large lengths with fused intermediate Bluestein operations
+};
+
+enum BluesteinFuseType
+{ // Fused operation types for multi-kernel Bluestein
+    BFT_NONE,
+    BFT_FWD_CHIRP, // fused chirp + padding + forward fft
+    BFT_FWD_CHIRP_MUL, // fused chirp / input Hadamard product + padding + forward fft
+    BFT_INV_CHIRP_MUL, // fused convolution Hadamard product + inverse fft + chirp Hadamard product
+};
+
 template <class T>
 struct real_type;
 
 template <>
-struct real_type<float4>
+struct real_type<rocfft_complex<float>>
 {
     typedef float type;
 };
 
 template <>
-struct real_type<double4>
+struct real_type<rocfft_complex<double>>
 {
     typedef double type;
 };
 
 template <>
-struct real_type<float2>
-{
-    typedef float type;
-};
-
-template <>
-struct real_type<double2>
+struct real_type<rocfft_complex<_Float16>>
 {
-    typedef double type;
+    typedef _Float16 type;
 };
 
 template <class T>
 using real_type_t = typename real_type<T>::type;
 
-/* example of using real_type_t */
-// real_type_t<float2> float_scalar;
-// real_type_t<double2> double_scalar;
-
 template <class T>
 struct complex_type;
 
 template <>
 struct complex_type<float>
 {
-    typedef float2 type;
+    typedef rocfft_complex<float> type;
 };
 
 template <>
 struct complex_type<double>
 {
-    typedef double2 type;
+    typedef rocfft_complex<double> type;
 };
 
 template <class T>
@@ -214,59 +224,6 @@
 // complex_type_t<float> float_complex_val;
 // complex_type_t<double> double_complex_val;
 
-template <class T>
-struct vector4_type;
-
-template <>
-struct vector4_type<float2>
-{
-    typedef float4 type;
-};
-
-template <>
-struct vector4_type<double2>
-{
-    typedef double4 type;
-};
-
-template <class T>
-using vector4_type_t = typename vector4_type<T>::type;
-
-/* example of using vector4_type_t */
-// vector4_type_t<float2> float4_scalar;
-// vector4_type_t<double2> double4_scalar;
-
-template <typename T>
-__device__ inline T lib_make_vector2(real_type_t<T> v0, real_type_t<T> v1);
-
-template <>
-__device__ inline float2 lib_make_vector2(float v0, float v1)
-{
-    return make_float2(v0, v1);
-}
-
-template <>
-__device__ inline double2 lib_make_vector2(double v0, double v1)
-{
-    return make_double2(v0, v1);
-}
-
-template <typename T>
-__device__ inline T
-    lib_make_vector4(real_type_t<T> v0, real_type_t<T> v1, real_type_t<T> v2, real_type_t<T> v3);
-
-template <>
-__device__ inline float4 lib_make_vector4(float v0, float v1, float v2, float v3)
-{
-    return make_float4(v0, v1, v2, v3);
-}
-
-template <>
-__device__ inline double4 lib_make_vector4(double v0, double v1, double v2, double v3)
-{
-    return make_double4(v0, v1, v2, v3);
-}
-
 template <typename T>
 __device__ T TWLstep1(const T* twiddles, size_t u)
 {
@@ -282,8 +239,8 @@
     T      result = twiddles[j];
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
-                                 (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
+    result = T((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
+               (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
     return result;
 }
 
@@ -294,12 +251,12 @@
     T      result = twiddles[j];
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
-                                 (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
+    result = T((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
+               (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[512 + j].x - result.y * twiddles[512 + j].y),
-                                 (result.y * twiddles[512 + j].x + result.x * twiddles[512 + j].y));
+    result = T((result.x * twiddles[512 + j].x - result.y * twiddles[512 + j].y),
+               (result.y * twiddles[512 + j].x + result.x * twiddles[512 + j].y));
     return result;
 }
 
@@ -310,16 +267,16 @@
     T      result = twiddles[j];
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
-                                 (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
+    result = T((result.x * twiddles[256 + j].x - result.y * twiddles[256 + j].y),
+               (result.y * twiddles[256 + j].x + result.x * twiddles[256 + j].y));
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[512 + j].x - result.y * twiddles[512 + j].y),
-                                 (result.y * twiddles[512 + j].x + result.x * twiddles[512 + j].y));
+    result = T((result.x * twiddles[512 + j].x - result.y * twiddles[512 + j].y),
+               (result.y * twiddles[512 + j].x + result.x * twiddles[512 + j].y));
     u >>= 8;
     j      = u & 255;
-    result = lib_make_vector2<T>((result.x * twiddles[768 + j].x - result.y * twiddles[768 + j].y),
-                                 (result.y * twiddles[768 + j].x + result.x * twiddles[768 + j].y));
+    result = T((result.x * twiddles[768 + j].x - result.y * twiddles[768 + j].y),
+               (result.y * twiddles[768 + j].x + result.x * twiddles[768 + j].y));
     return result;
 }
 
diff -Nru rocfft-5.5.0/library/src/device/kernels/twiddle_factors.h rocfft-5.7.1/library/src/device/kernels/twiddle_factors.h
--- rocfft-5.5.0/library/src/device/kernels/twiddle_factors.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/device/kernels/twiddle_factors.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,127 +0,0 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#ifndef TWIDDLE_FACTORS_H
-#define TWIDDLE_FACTORS_H
-
-#include "common.h"
-
-static const unsigned int TWIDDLES_THREADS     = 32;
-static const unsigned int TWIDDLES_MAX_RADICES = 8;
-static constexpr double   TWO_PI               = -6.283185307179586476925286766559;
-
-// structure to pass fixed-length array of radices by value in
-// kernargs instead of by reference in global memory
-struct radices_t
-{
-    size_t data[TWIDDLES_MAX_RADICES];
-};
-
-template <typename T>
-__global__ void __launch_bounds__(TWIDDLES_THREADS* TWIDDLES_THREADS)
-    GenerateTwiddleTableKernel(size_t    length_limit,
-                               size_t    num_radices,
-                               radices_t radices,
-                               radices_t radices_prod,
-                               radices_t radices_sum_prod,
-                               T*        output)
-{
-    auto i = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(i < num_radices - 1)
-    {
-        auto L     = radices_prod.data[i];
-        auto radix = radices.data[i + 1];
-        auto k     = threadIdx.y + blockIdx.y * blockDim.y;
-
-        if(k < L / radix)
-        {
-            double theta = TWO_PI * (k) / (L);
-            auto   index = radices_sum_prod.data[i] + k * (radices.data[i + 1] - 1);
-
-            for(size_t j = 1; j < radix && index < length_limit; ++j)
-            {
-                output[index].x = cos((j)*theta);
-                output[index].y = sin((j)*theta);
-
-                ++index;
-            }
-        }
-    }
-}
-
-template <typename T>
-__global__ void __launch_bounds__(TWIDDLES_THREADS)
-    GenerateTwiddleTableKernel(size_t length_limit, size_t N, T* output)
-{
-    auto i = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(i < N && i < length_limit)
-    {
-        double c = cos(TWO_PI * i / N);
-        double s = sin(TWO_PI * i / N);
-
-        output[i].x = c;
-        output[i].y = s;
-    }
-}
-
-template <typename T>
-__global__ void __launch_bounds__(TWIDDLES_THREADS)
-    GenerateHalfNTableKernel(size_t half_N, size_t N, T* output)
-{
-    auto i = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(i < half_N)
-    {
-        double c = cos(TWO_PI * i / (2 * N));
-        double s = sin(TWO_PI * i / (2 * N));
-
-        output[i].x = c;
-        output[i].y = s;
-    }
-}
-
-template <typename T>
-__global__ void __launch_bounds__(TWIDDLES_THREADS* TWIDDLES_THREADS)
-    GenerateTwiddleTableLargeKernel(double phi, size_t base, size_t X, size_t Y, T* output)
-{
-    auto iY = threadIdx.y + blockIdx.y * blockDim.y;
-
-    if(iY < Y)
-    {
-        auto iX = threadIdx.x + blockIdx.x * blockDim.x;
-
-        if(iX < X)
-        {
-            auto j = (static_cast<size_t>(1) << (iY * base)) * iX;
-
-            double c = cos(phi * j);
-            double s = sin(phi * j);
-
-            auto index = iY * X + iX;
-
-            output[index].x = c;
-            output[index].y = s;
-        }
-    }
-}
-
-#endif // TWIDDLE_FACTORS_H
diff -Nru rocfft-5.5.0/library/src/device/solution-shipping.py rocfft-5.7.1/library/src/device/solution-shipping.py
--- rocfft-5.5.0/library/src/device/solution-shipping.py	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/device/solution-shipping.py	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""rocFFT solution map library builder.
+
+"""
+
+import argparse
+import json
+from os import listdir
+from os.path import isfile, join
+from pathlib import Path
+from types import SimpleNamespace as NS
+
+from generator import (ArgumentList, BaseNode, Call, CommentLines, Function,
+                       Include, LineBreak, Map, StatementList, Throw, Variable,
+                       Assign, If, ReturnStatement, name_args, write)
+
+#
+# String and GPU-arch helpers
+#
+
+
+def cjoin(xs):
+    """Return the str() of every item of 'xs', separated by commas."""
+    return ','.join(map(str, xs))
+
+
+# get gfx___ which is the prefix of the solution map
+def get_local_gpu_gfx(archs):
+    """Return the unique gfx names of 'archs', in first-seen order.
+
+    Everything from the first ':' of a target onwards is dropped.
+    """
+    # dict keys keep insertion order, which gives an ordered de-duplication
+    base_names = (target.split(':')[0] for target in archs)
+    return list(dict.fromkeys(base_names))
+
+
+#
+# Prototype generators
+#
+
+
+@name_args(['config', 'meta'])
+class KernelConfig(BaseNode):
+    # Renders the 'config' dict as the text of a C++ 'KernelConfig(...)'
+    # expression.
+    def __str__(self):
+        cfg = self.config
+
+        def flag(key):
+            # 'true' only when the value compares equal to True; None and
+            # everything else give 'false'
+            value = cfg[key]
+            return 'true' if (value is not None and value == True) else 'false'
+
+        def number(key):
+            # a None scalar is written as 0
+            value = cfg[key]
+            return str(value) if value is not None else '0'
+
+        def braces(key):
+            # a None list is written as '{ 0 }'
+            values = cfg[key]
+            if values is None:
+                return '{ 0 }'
+            return '{' + cjoin(values) + '}'
+
+        # entries are evaluated, and therefore looked up, in printing order
+        fields = [
+            flag('use_3steps'),
+            braces('factors'),
+            number('tpb'),
+            number('wgs'),
+            braces('tpt'),
+            flag('half_lds'),
+            flag('dir_reg'),
+            flag('buffer_inst'),
+        ]
+
+        # every field after the first is preceded by ', '
+        return 'KernelConfig(' + ', '.join(fields) + ')'
+
+
+@name_args(['key', 'meta'])
+class FMKey(BaseNode):
+    # Renders the 'key' dict as the text of a C++ 'fpkey(...)' expression.
+    def __str__(self):
+        lengths = self.key['lengths']
+        parts = [str(lengths[0])]
+        if lengths[1] != 0:
+            parts.append(str(lengths[1]))
+        parts += [
+            'StrToPrecision("' + str(self.key['precision']) + '")',
+            'StrToComputeScheme("' + str(self.key['scheme']) + '")',
+            'StrToSBRCTransType("' + str(self.key['sbrc_trans']) + '")',
+            str(KernelConfig(self.key['kernelConfig'])),
+        ]
+        return 'fpkey(' + ', '.join(parts) + ')'
+
+
+@name_args(['key', 'meta'])
+class SolutionPtr(BaseNode):
+    # Renders a child-node reference as the brace initializer
+    # {"<child_token>",<child_option>}.
+    def __str__(self):
+        child_token = str(self.key['child_token'])
+        child_option = str(self.key['child_option'])
+        # the token is wrapped in double quotes, the option is written bare
+        return '{"' + child_token + '",' + child_option + '}'
+
+
+@name_args(['meta'])
+class SolutionNode(BaseNode):
+    # Node type with a single named argument, 'meta'.
+    def __str__(self):
+        # Always renders as the fixed text 'SolutionNode'.
+        return 'SolutionNode'
+
+
+def generate_solution_map(solutions):
+    """Generate function to populate the solution map.
+
+    Returns a StatementList with two includes, the definition of the
+    'solution_map::solution_map' function built below, and a line break.
+    """
+    solution_nodes = Map('solution_nodes')  # NOTE(review): never read below - confirm it is needed
+    var_solution = Variable('solution', 'SolutionNode')
+
+    populate = StatementList()
+    populate += If('!rocfft_getenv("ROCFFT_USE_EMPTY_SOL_MAP").empty()',
+                   ReturnStatement())
+
+    if len(solutions) > 0:  # declare 'solution' only when it will be assigned
+        populate += var_solution.declaration()
+        populate += LineBreak()
+
+    for sol in solutions:
+        arch, token, sol_node_type, scheme, kernel_key, childrens = \
+            sol.meta.arch, sol.meta.token, sol.meta.sol_type, \
+                sol.meta.scheme, sol.meta.kernel, sol.meta.childnodes
+
+        populate += LineBreak()
+        populate += CommentLines("add new solution")
+
+        # assigning solution data
+        populate += Assign(
+            str(var_solution) + '.sol_node_type',
+            'StrToSolutionNodeType("' + sol_node_type + '")')
+
+        # Check if it has .kernel_key field
+        populate += Assign(
+            str(var_solution) + '.kernel_key',
+            FMKey(kernel_key) if kernel_key is not None else 'EmptyFMKey')
+
+        # SOL_INTERNAL_NODE or SOL_LEAF_NODE or SOL_DUMMY
+        if scheme is not None:
+            populate += Assign(
+                str(var_solution) + '.using_scheme',
+                'StrToComputeScheme("' + str(scheme) + '")')
+        # SOL_KERNEL_ONLY
+        elif kernel_key is not None:
+            populate += Assign(
+                str(var_solution) + '.using_scheme',
+                'std::get<2>(' + str(var_solution) + '.kernel_key)')
+        # SOL_BUILTIN_KERNEL
+        else:
+            populate += Assign(str(var_solution) + '.using_scheme', 'CS_NONE')
+
+        populate += Assign(
+            str(var_solution) + '.solution_childnodes',
+            '{' + cjoin([SolutionPtr(s) for s in childrens]) + '}')
+
+        # ready to add to map
+        probKey = Call(name='ProblemKey',
+                       arguments=ArgumentList('"' + arch + '"',
+                                              '"' + token + '"')).inline()
+        populate += Call(name='add_solution_private',
+                         arguments=ArgumentList(probKey, var_solution))
+
+    return StatementList(
+        Include('"solution_map.h"'), Include('"../../shared/environment.h"'),
+        Function(name='solution_map::solution_map',
+                 value=False,
+                 arguments=ArgumentList(),
+                 body=populate), LineBreak())
+
+
+#
+# Main!
+#
+
+
+def generate_solutions(archs, folder):
+    """Collect the solutions stored in the map files of 'folder'.
+
+    Only regular files whose name, up to the first '_', is in 'archs' are
+    read.  Returns a list with one SolutionNode per solution; the list is
+    empty when 'folder' is not an existing directory.
+    """
+    solutions = []
+    if not Path(folder).is_dir():
+        return solutions
+
+    # sorted: the output must not depend on the directory listing order
+    all_files = sorted(f for f in listdir(folder) if isfile(join(folder, f)))
+    target_files = [
+        join(folder, f) for f in all_files if f.split('_')[0] in archs
+    ]
+
+    for file in target_files:
+        # the 'with' block closes the file once it has been parsed
+        with open(file, 'r') as solution_map_file:
+            solution_map_data = json.load(solution_map_file)
+
+        # versioned files keep the entries under 'Data'
+        version = 'no version num'
+        if 'Version' in solution_map_data:
+            version = solution_map_data['Version']
+            all_solutions = solution_map_data['Data']
+        else:
+            all_solutions = solution_map_data
+
+        print('-- format version=' + str(version))
+
+        for entry_dict in all_solutions:
+            entry = NS(**entry_dict)
+            problem = NS(**entry.Problem)
+            arch = problem.arch
+            token = problem.token
+
+            # each item of 'Solutions' is a dict describing one solution node
+            for solution_dict in entry.Solutions:
+                solution = NS(**solution_dict)
+                sol_node_type = solution.sol_node_type
+                using_scheme = getattr(solution, 'using_scheme', None)
+                kernel_key = getattr(solution, 'kernel_key', None)
+                childrens = getattr(solution, 'solution_childnodes', [])
+                s = SolutionNode(meta=NS(
+                    arch=str(arch),
+                    token=str(token),
+                    sol_type=str(sol_node_type),
+                    scheme=using_scheme,
+                    kernel=kernel_key,
+                    childnodes=childrens,
+                ))
+                solutions.append(s)
+
+    return solutions
+
+
+def cli():
+    """Command line interface: generate 'solutions.cpp' for the given archs."""
+    parser = argparse.ArgumentParser(prog='solution-shipping')
+    # required: both values are used unconditionally below
+    parser.add_argument('--gpu-arch',
+                        type=str,
+                        required=True,
+                        help='Solutions of specific gpu arch')
+    parser.add_argument('--data-folder',
+                        type=str,
+                        required=True,
+                        help='Folder containing the solution map text files.')
+    args = parser.parse_args()
+
+    print('-- gpu_arch=' + args.gpu_arch)
+    print('-- data-folder=' + args.data_folder)
+
+    archs = args.gpu_arch.split()  # any run of whitespace separates names
+    if 'all' in archs:
+        archs = [
+            'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx1030', 'gfx1100',
+            'gfx1101', 'gfx1102'
+        ]
+
+    # remove xnack and sramecc
+    archs = get_local_gpu_gfx(archs)
+    archs.append('any')
+    solutions = generate_solutions(archs, args.data_folder)
+    write('solutions.cpp', generate_solution_map(solutions), format=True)
+
+
+if __name__ == '__main__':
+    cli()
diff -Nru rocfft-5.5.0/library/src/enum_printer.cpp rocfft-5.7.1/library/src/enum_printer.cpp
--- rocfft-5.5.0/library/src/enum_printer.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/enum_printer.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,151 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "enum_printer.h"
+
+#include <map>
+#include <set>
+
+#define TO_STR2(x) #x
+#define TO_STR(x) TO_STR2(x)
+#define ENUMSTR(x) x, TO_STR(x)
+#define STRENUM(x) TO_STR(x), x
+
+static std::map<rocfft_precision, const char*> PrecisionToStrMap()
+{
+    std::map<rocfft_precision, const char*> PrecisionToStr = {{rocfft_precision_single, "single"},
+                                                              {rocfft_precision_double, "double"},
+                                                              {rocfft_precision_half, "half"}};
+    return PrecisionToStr;
+}
+
+static std::map<std::string, rocfft_precision> StrToPrecisionMap()
+{
+    std::map<std::string, rocfft_precision> StrToPrecision;
+    for(auto i : PrecisionToStrMap())
+        StrToPrecision.emplace(i.second, i.first);
+    return StrToPrecision;
+}
+
+static std::map<SBRC_TRANSPOSE_TYPE, const char*> SBRCTransTypetoStrMap()
+{
+    std::map<SBRC_TRANSPOSE_TYPE, const char*> SBRCTransTypeToStr = {
+        {ENUMSTR(NONE)}, {ENUMSTR(DIAGONAL)}, {ENUMSTR(TILE_ALIGNED)}, {ENUMSTR(TILE_UNALIGNED)}};
+    return SBRCTransTypeToStr;
+}
+
+static std::map<std::string, SBRC_TRANSPOSE_TYPE> StrToSBRCTransTypeMap()
+{
+    std::map<std::string, SBRC_TRANSPOSE_TYPE> StrToSBRCTransType;
+    for(auto i : SBRCTransTypetoStrMap())
+        StrToSBRCTransType.emplace(i.second, i.first);
+    return StrToSBRCTransType;
+}
+
+std::string PrintOperatingBuffer(const OperatingBuffer ob)
+{
+    static const std::map<OperatingBuffer, const char*> BuffertoString
+        = {{ENUMSTR(OB_UNINIT)},
+           {ENUMSTR(OB_USER_IN)},
+           {ENUMSTR(OB_USER_OUT)},
+           {ENUMSTR(OB_TEMP)},
+           {ENUMSTR(OB_TEMP_CMPLX_FOR_REAL)},
+           {ENUMSTR(OB_TEMP_BLUESTEIN)}};
+    return BuffertoString.at(ob);
+}
+
+std::string PrintOperatingBufferCode(const OperatingBuffer ob)
+{
+    static const std::map<OperatingBuffer, const char*> BuffertoString
+        = {{OB_UNINIT, "ERR"},
+           {OB_USER_IN, "A"},
+           {OB_USER_OUT, "B"},
+           {OB_TEMP, "T"},
+           {OB_TEMP_CMPLX_FOR_REAL, "C"},
+           {OB_TEMP_BLUESTEIN, "S"}};
+    return BuffertoString.at(ob);
+}
+
+std::string PrintOptimizeStrategy(const rocfft_optimize_strategy ros)
+{
+    static const std::map<rocfft_optimize_strategy, const char*> StrategytoString
+        = {{rocfft_optimize_min_buffer, "MINIMIZE_BUFFER"},
+           {rocfft_optimize_balance, "BALANCE_BUFFER_FUSION"},
+           {rocfft_optimize_max_fusion, "MAXIMIZE_FUSION"}};
+    return StrategytoString.at(ros);
+}
+
+std::string PrintSBRCTransposeType(const SBRC_TRANSPOSE_TYPE ty)
+{
+    static auto sbrc2strMap = SBRCTransTypetoStrMap();
+    return sbrc2strMap.at(ty);
+}
+
+std::string PrintDirectToFromRegMode(const DirectRegType ty)
+{
+    static const std::map<DirectRegType, const char*> TypetoString
+        = {{ENUMSTR(FORCE_OFF_OR_NOT_SUPPORT)}, {ENUMSTR(TRY_ENABLE_IF_SUPPORT)}};
+    return TypetoString.at(ty);
+}
+
+std::string PrintPrecision(const rocfft_precision pre)
+{
+    static auto precision2strMap = PrecisionToStrMap();
+    return precision2strMap.at(pre);
+}
+
+std::string PrintArrayType(const rocfft_array_type aryType)
+{
+    static const std::map<rocfft_array_type, const char*> aryTypeStr
+        = {{rocfft_array_type_complex_interleaved, "CI"},
+           {rocfft_array_type_complex_planar, "CP"},
+           {rocfft_array_type_real, "R"},
+           {rocfft_array_type_hermitian_interleaved, "HI"},
+           {rocfft_array_type_hermitian_planar, "HP"},
+           {rocfft_array_type_unset, "NA"}};
+    return aryTypeStr.at(aryType);
+}
+std::string PrintPlacement(const rocfft_result_placement placement)
+{
+    static const std::map<rocfft_result_placement, const char*> placementStr
+        = {{rocfft_placement_inplace, "IP"}, {rocfft_placement_notinplace, "OP"}};
+    return placementStr.at(placement);
+}
+std::string PrintEBType(const EmbeddedType ebtype)
+{
+    if(ebtype == EmbeddedType::NONE)
+        return std::string("NONE");
+    else if(ebtype == EmbeddedType::Real2C_POST)
+        return std::string("POST");
+    else
+        return std::string("PRE");
+}
+
+SBRC_TRANSPOSE_TYPE StrToSBRCTransType(const std::string& str)
+{
+    static auto str2sbrcMap = StrToSBRCTransTypeMap();
+    return str2sbrcMap.at(str);
+}
+
+rocfft_precision StrToPrecision(const std::string& str)
+{
+    static auto str2precisionMap = StrToPrecisionMap();
+    return str2precisionMap.at(str);
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/fuse_shim.cpp rocfft-5.7.1/library/src/fuse_shim.cpp
--- rocfft-5.5.0/library/src/fuse_shim.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/fuse_shim.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -41,8 +41,11 @@
     // sizes that do 7 rows seem to be slower for single.
     // TODO: the threshold may be set dependent one what kind of transport is the fused kernel
     //   eg. different value for TRANSPOSE, Z_XY, and XY_Z...
-    //   for example, 21504 -t 1 --double works quite good with minRows==2
-    size_t minRows = stockham->precision == rocfft_precision_single ? 8 : 4;
+    //   for example, 21504 -t 1 --precision double works quite well with minRows==2
+    size_t minRows = (stockham->precision == rocfft_precision_single
+                      || stockham->precision == rocfft_precision_half)
+                         ? 8
+                         : 4;
     return numTrans >= minRows;
 }
 
@@ -252,6 +255,7 @@
         fused->outStride[1] = transpose->outStride[2];
         fused->outStride[2] = transpose->outStride[0];
     }
+    fused->outputLength = transpose->outputLength;
 
     return fused;
 }
diff -Nru rocfft-5.5.0/library/src/include/assignment_policy.h rocfft-5.7.1/library/src/include/assignment_policy.h
--- rocfft-5.5.0/library/src/include/assignment_policy.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/assignment_policy.h	2023-08-09 16:19:51.000000000 +0000
@@ -114,12 +114,24 @@
 public:
     AssignmentPolicy() = default;
 
-    bool AssignBuffers(ExecPlan& execPlan);
+    void AssignBuffers(ExecPlan& execPlan);
 
     // pad temp buffers in a plan to avoid badly-performing strided accesses
     void PadPlan(ExecPlan& execPlan);
 
 private:
+    // Traverses the root plan tree to find nodes that can be run with the
+    // multi-kernel fused Bluestein algorithm
+    void FindBluesteinFusedNodes(ExecPlan& execPlan, std::vector<TreeNode*>& fusedNodes);
+
+    // Assign chirp buffers in fused multi-kernel Bluestein implementation.
+    // The first node in fused Bluestein is not connected to the rest
+    // of the nodes and, therefore, a separate run of AssignBuffers is needed
+    // on the first node.
+    void AssignChirpBuffers(ExecPlan& execPlan);
+
+    void AssignBuffers_internal(ExecPlan& execPlan);
+
     static std::vector<size_t> GetEffectiveNodeOutLen(ExecPlan& execPlan, const TreeNode& node);
 
     // test if rootArrayType == testArrayType,
diff -Nru rocfft-5.5.0/library/src/include/chirp.h rocfft-5.7.1/library/src/include/chirp.h
--- rocfft-5.5.0/library/src/include/chirp.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/chirp.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,33 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#pragma once
+#if !defined(CHIRP_H)
+#define CHIRP_H
+
+#include "../../../shared/gpubuf.h"
+#include "rocfft.h"
+#include <vector>
+
+gpubuf
+    chirp_create(size_t N, rocfft_precision precision, const char* gpu_arch, unsigned int deviceId);
+
+void chirp_streams_cleanup();
+
+#endif // defined( CHIRP_H )
diff -Nru rocfft-5.5.0/library/src/include/compute_scheme.h rocfft-5.7.1/library/src/include/compute_scheme.h
--- rocfft-5.5.0/library/src/include/compute_scheme.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/compute_scheme.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,9 @@
 
 #include <string>
 
+// TODO-
+// (PROB_DESCRIPTION << 24) | (KERNEL_ALGO << 16 ) |
+// (KERNEL_OR_DECOMPOSITION_TYPE << 8) | (KERNEL_LAYOUT) ????
 enum ComputeScheme
 {
     CS_NONE,
@@ -80,6 +83,10 @@
     CS_KERNEL_3D_SINGLE // not implemented yet
 };
 
-std::string PrintScheme(ComputeScheme cs);
+// print abbreviation for kernel scheme
+std::string   PrintKernelSchemeAbbr(ComputeScheme cs);
+std::string   PrintScheme(ComputeScheme cs);
+ComputeScheme StrToComputeScheme(const std::string& str);
+bool          ComputeSchemeIsAProblem(ComputeScheme cs);
 
 #endif
diff -Nru rocfft-5.5.0/library/src/include/data_descriptor.h rocfft-5.7.1/library/src/include/data_descriptor.h
--- rocfft-5.5.0/library/src/include/data_descriptor.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/data_descriptor.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,215 @@
+/******************************************************************************
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef DATA_DESCRIPTOR_H
+#define DATA_DESCRIPTOR_H
+
+#include <regex>
+#include <sstream>
+
+static inline std::string quote_str(const std::string& s)
+{
+    return "\"" + s + "\""; // wrap s in double quotes; embedded quotes are not escaped
+}
+
+template <typename T>
+struct ToString;
+
+template <typename T>
+struct VectorToString;
+
+template <typename T>
+struct FieldDescriptor;
+
+template <typename T>
+struct VectorFieldDescriptor;
+
+template <typename T>
+struct FromString;
+
+template <typename T>
+struct StringToVector;
+
+template <typename T>
+struct FieldParser;
+
+template <typename T>
+struct VectorFieldParser;
+
+template <typename T>
+struct ToString
+{
+    std::string print(const T& value) const
+    {
+        return std::to_string(value);
+    }
+};
+
+template <>
+struct ToString<bool>
+{
+    std::string print(const bool& value) const
+    {
+        return value ? std::string("true") : std::string("false");
+    }
+};
+
+template <>
+struct ToString<std::string>
+{
+    std::string print(const std::string& value) const
+    {
+        return quote_str(value);
+    }
+};
+
+template <typename T>
+struct VectorToString
+{
+    std::string print(const std::vector<T>& vec,
+                      bool                  elem_newline = false,
+                      const std::string&    indent       = "") const
+    {
+        const char* COMMA      = ",";
+        const char* LIST_DELIM = "";
+        std::string list_str   = "[ ";
+        for(auto i : vec)
+        {
+            list_str += LIST_DELIM;
+            list_str += ToString<T>().print(i);
+            LIST_DELIM = COMMA;
+
+            if(elem_newline)
+                list_str += "\n";
+            list_str += indent;
+        }
+        // Trick: emit a space before the ']'. That way a ']' followed by a
+        // comma or '}' becomes an individual token, which serves as the
+        // hint-key marking the end of a vector.
+        list_str += " ]";
+        return list_str;
+    }
+};
+
+template <typename T>
+struct FieldDescriptor
+{
+    std::string describe(const std::string& key, const T& value) const
+    {
+        return quote_str(key) + ":" + ToString<T>().print(value);
+    }
+};
+
+template <typename T>
+struct VectorFieldDescriptor
+{
+    std::string describe(const std::string&    key,
+                         const std::vector<T>& vec,
+                         bool                  elem_newline = false,
+                         const std::string&    indent       = "") const
+    {
+        return quote_str(key) + ":" + VectorToString<T>().print(vec, elem_newline, indent);
+    }
+};
+
+template <>
+struct FromString<size_t>
+{
+    void Get(size_t& ret, std::sregex_token_iterator& current) const
+    {
+        ret = std::stoull(current->str());
+    }
+};
+
+template <>
+struct FromString<int>
+{
+    void Get(int& ret, std::sregex_token_iterator& current) const
+    {
+        ret = std::stoi(current->str());
+    }
+};
+
+template <>
+struct FromString<bool>
+{
+    void Get(bool& ret, std::sregex_token_iterator& current) const
+    {
+        ret = (current->str() == "true");
+    }
+};
+
+template <>
+struct FromString<std::string>
+{
+    void Get(std::string& ret, std::sregex_token_iterator& current) const
+    {
+        ret = current->str();
+    }
+};
+
+template <typename T>
+struct StringToVector
+{
+    void Get(std::vector<T>& ret, std::sregex_token_iterator& current) const
+    {
+        static const std::string hintKey("]");
+        while(current->str() != hintKey)
+        {
+            T elem;
+            FromString<T>().Get(elem, current);
+            ret.push_back(elem);
+            ++current;
+        }
+    }
+};
+
+template <typename T>
+struct FieldParser
+{
+    void parse(const std::string& expectedKey, T& value, std::sregex_token_iterator& current) const
+    {
+        while(current->str() != expectedKey)
+            ++current;
+
+        ++current;
+        FromString<T>().Get(value, current);
+    }
+};
+
+template <typename T>
+struct VectorFieldParser
+{
+    void parse(const std::string&          expectedKey,
+               std::vector<T>&             vec,
+               std::sregex_token_iterator& current) const
+    {
+        while(current->str() != expectedKey)
+            ++current;
+
+        ++current;
+        vec.clear();
+        StringToVector<T>().Get(vec, current);
+    }
+};
+
+#endif // DATA_DESCRIPTOR_H
diff -Nru rocfft-5.5.0/library/src/include/enum_printer.h rocfft-5.7.1/library/src/include/enum_printer.h
--- rocfft-5.5.0/library/src/include/enum_printer.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/enum_printer.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,61 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ENUM_PRINTER_H
+#define ENUM_PRINTER_H
+
+#include <string>
+
+#include "../../../shared/rocfft_complex.h"
+#include "../device/kernels/common.h"
+#include "rocfft.h"
+
+enum OperatingBuffer
+{
+    OB_UNINIT              = 0b00000,
+    OB_USER_IN             = 0b00001,
+    OB_USER_OUT            = 0b00010,
+    OB_TEMP                = 0b00100,
+    OB_TEMP_CMPLX_FOR_REAL = 0b01000,
+    OB_TEMP_BLUESTEIN      = 0b10000,
+};
+
+// the decision strategy for buffer assignment
+enum rocfft_optimize_strategy
+{
+    rocfft_optimize_min_buffer, // minimize number of buffers, possibly fewer fusions
+    rocfft_optimize_balance, // balance between buffer and fusion
+    rocfft_optimize_max_fusion, // maximize number of fusions, possibly more buffers
+};
+
+std::string PrintOperatingBuffer(const OperatingBuffer ob);
+std::string PrintOperatingBufferCode(const OperatingBuffer ob);
+std::string PrintOptimizeStrategy(const rocfft_optimize_strategy ros);
+std::string PrintDirectToFromRegMode(const DirectRegType ty);
+std::string PrintArrayType(const rocfft_array_type aryType);
+std::string PrintPlacement(const rocfft_result_placement placement);
+std::string PrintEBType(const EmbeddedType ebtype);
+std::string PrintSBRCTransposeType(const SBRC_TRANSPOSE_TYPE ty);
+std::string PrintPrecision(const rocfft_precision pre);
+
+SBRC_TRANSPOSE_TYPE StrToSBRCTransType(const std::string& str);
+rocfft_precision    StrToPrecision(const std::string& str);
+
+#endif
diff -Nru rocfft-5.5.0/library/src/include/function_map_key.h rocfft-5.7.1/library/src/include/function_map_key.h
--- rocfft-5.5.0/library/src/include/function_map_key.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/function_map_key.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,387 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef FUNCTION_MAP_KEY_H
+#define FUNCTION_MAP_KEY_H
+
+#include <array>
+#include <tuple>
+#include <vector>
+
+#include "compute_scheme.h"
+#include "data_descriptor.h"
+#include "enum_printer.h"
+#include "twiddles.h"
+
+struct KernelConfig
+{
+    bool                use_3steps_large_twd  = false;
+    bool                half_lds              = false;
+    bool                direct_to_from_reg    = false;
+    bool                intrinsic_buffer_inst = false;
+    unsigned int        transforms_per_block  = 0;
+    int                 workgroup_size        = 0;
+    std::array<int, 2>  threads_per_transform = {0, 0};
+    std::vector<size_t> factors               = {0};
+
+    KernelConfig()                    = default;
+    KernelConfig(const KernelConfig&) = default;
+
+    KernelConfig(bool                  use_3steps,
+                 std::vector<size_t>&& factors,
+                 int                   tpb,
+                 int                   wgs,
+                 std::array<int, 2>&&  tpt,
+                 bool                  half_lds              = false,
+                 bool                  direct_to_from_reg    = false,
+                 bool                  intrinsic_buffer_inst = false)
+        : use_3steps_large_twd(use_3steps)
+        , half_lds(half_lds)
+        , direct_to_from_reg(direct_to_from_reg)
+        , intrinsic_buffer_inst(intrinsic_buffer_inst)
+        , transforms_per_block(tpb)
+        , workgroup_size(wgs)
+        , threads_per_transform(tpt)
+        , factors(factors)
+    {
+    }
+
+    KernelConfig& operator=(const KernelConfig&) = default;
+
+    bool operator==(const KernelConfig& rhs) const
+    {
+        return std::tie(use_3steps_large_twd,
+                        half_lds,
+                        direct_to_from_reg,
+                        intrinsic_buffer_inst,
+                        transforms_per_block,
+                        workgroup_size,
+                        threads_per_transform,
+                        factors)
+               == std::tie(rhs.use_3steps_large_twd,
+                           rhs.half_lds,
+                           rhs.direct_to_from_reg,
+                           rhs.intrinsic_buffer_inst,
+                           rhs.transforms_per_block,
+                           rhs.workgroup_size,
+                           rhs.threads_per_transform,
+                           rhs.factors);
+    }
+
+    bool operator<(const KernelConfig& rhs) const
+    {
+        size_t l_h = std::hash<bool>{}(use_3steps_large_twd);
+        size_t r_h = std::hash<bool>{}(rhs.use_3steps_large_twd);
+        if(l_h > r_h)
+            return true;
+        if(l_h < r_h)
+            return false;
+
+        l_h = std::hash<bool>{}(half_lds);
+        r_h = std::hash<bool>{}(rhs.half_lds);
+        if(l_h > r_h)
+            return true;
+        if(l_h < r_h)
+            return false;
+
+        l_h = std::hash<bool>{}(direct_to_from_reg);
+        r_h = std::hash<bool>{}(rhs.direct_to_from_reg);
+        if(l_h > r_h)
+            return true;
+        if(l_h < r_h)
+            return false;
+
+        l_h = std::hash<bool>{}(intrinsic_buffer_inst);
+        r_h = std::hash<bool>{}(rhs.intrinsic_buffer_inst);
+        if(l_h > r_h)
+            return true;
+        if(l_h < r_h)
+            return false;
+
+        if(transforms_per_block > rhs.transforms_per_block)
+            return true;
+        if(transforms_per_block < rhs.transforms_per_block)
+            return false;
+
+        if(workgroup_size > rhs.workgroup_size)
+            return true;
+        if(workgroup_size < rhs.workgroup_size)
+            return false;
+
+        if(threads_per_transform > rhs.threads_per_transform)
+            return true;
+        if(threads_per_transform < rhs.threads_per_transform)
+            return false;
+
+        return (factors > rhs.factors);
+    }
+
+    std::string Print() const
+    {
+        std::stringstream ss;
+        ss << "KernelConfig: {";
+
+        ss << "3steps: " << (use_3steps_large_twd ? "true" : "false")
+           << ", half_lds: " << (half_lds ? "true" : "false")
+           << ", direct_reg: " << (direct_to_from_reg ? "true" : "false")
+           << ", try_use_buf_inst: " << (intrinsic_buffer_inst ? "true" : "false")
+           << ", tpb: " << transforms_per_block << ", wgs: " << workgroup_size << ", tpt: ["
+           << threads_per_transform[0] << "," << threads_per_transform[1] << "], factors: [";
+
+        std::string COMMA = "";
+        for(auto factor : factors)
+        {
+            ss << COMMA << factor;
+            COMMA = ", ";
+        }
+        ss << "]";
+
+        ss << "}";
+
+        return ss.str();
+    }
+
+    static KernelConfig EmptyConfig()
+    {
+        static KernelConfig empty;
+        return empty;
+    }
+};
+
+namespace std
+{
+    // hash function of FMKey requires the definition of hash function of kernel-config
+    template <>
+    struct hash<KernelConfig>
+    {
+        size_t operator()(const KernelConfig& config) const noexcept
+        {
+            size_t h = 0;
+            h ^= std::hash<bool>{}(config.use_3steps_large_twd);
+            h ^= std::hash<bool>{}(config.half_lds);
+            h ^= std::hash<bool>{}(config.direct_to_from_reg);
+            h ^= std::hash<bool>{}(config.intrinsic_buffer_inst);
+            h ^= std::hash<unsigned int>{}(config.transforms_per_block);
+            h ^= std::hash<int>{}(config.workgroup_size);
+            for(auto& v : config.threads_per_transform)
+                h ^= std::hash<int>{}(v);
+
+            // twiddle_factors defines a TWIDDLES_MAX_RADICES = 8
+            // which means the maximal factorization pass is 8
+            auto factors_max_len = config.factors;
+            factors_max_len.resize(TWIDDLES_MAX_RADICES);
+
+            for(auto& v : factors_max_len)
+                h ^= std::hash<size_t>{}(v);
+            return h;
+        }
+    };
+}
+
+// Implementing the ToString / FromString (data_descriptor.h)
+// for writing-to/reading-from the text-format solution map
+template <>
+struct ToString<KernelConfig>
+{
+    std::string print(const KernelConfig& value) const
+    {
+        std::string      str = "{";
+        std::vector<int> tpt = {value.threads_per_transform[0], value.threads_per_transform[1]};
+
+        str += FieldDescriptor<bool>().describe("use_3steps", value.use_3steps_large_twd) + ",";
+        str += FieldDescriptor<bool>().describe("half_lds", value.half_lds) + ",";
+        str += FieldDescriptor<bool>().describe("dir_reg", value.direct_to_from_reg) + ",";
+        str += FieldDescriptor<bool>().describe("buffer_inst", value.intrinsic_buffer_inst) + ",";
+        str += FieldDescriptor<unsigned int>().describe("tpb", value.transforms_per_block) + ",";
+        str += FieldDescriptor<int>().describe("wgs", value.workgroup_size) + ",";
+        str += VectorFieldDescriptor<int>().describe("tpt", tpt) + ",";
+        str += VectorFieldDescriptor<size_t>().describe("factors", value.factors);
+        str += "}";
+        return str;
+    }
+};
+
+template <>
+struct FromString<KernelConfig>
+{
+    void Get(KernelConfig& ret, std::sregex_token_iterator& current) const
+    {
+        std::vector<int> tpt;
+        size_t           tpb = 0; // read as size_t, narrowed into the unsigned field below
+
+        FieldParser<bool>().parse("use_3steps", ret.use_3steps_large_twd, current);
+        FieldParser<bool>().parse("half_lds", ret.half_lds, current);
+        FieldParser<bool>().parse("dir_reg", ret.direct_to_from_reg, current);
+        FieldParser<bool>().parse("buffer_inst", ret.intrinsic_buffer_inst, current);
+        FieldParser<size_t>().parse("tpb", tpb, current);
+        FieldParser<int>().parse("wgs", ret.workgroup_size, current);
+        VectorFieldParser<int>().parse("tpt", tpt, current);
+        VectorFieldParser<size_t>().parse("factors", ret.factors, current);
+
+        ret.transforms_per_block     = tpb;
+        ret.threads_per_transform[0] = tpt.at(0); // at(): throw instead of UB on a short list
+        ret.threads_per_transform[1] = tpt.at(1);
+    }
+};
+
+// length, precision and scheme are the fundamental information of a kernel;
+// SBRC_TRANS is also necessary for SBRC or SBRC_3D, but for non-SBRC it is just NONE.
+// The newly added KernelConfig is the key to supporting "multi-configurations":
+// KernelConfig denotes which parameters we can alter to "generate and tune" a kernel.
+//
+// NB:
+//    Since we didn't have KernelConfig before, when getting the default kernels
+//    from the function_pool, the kernel_config "variable" would be a default EmptyConfig().
+//    But actually, the config is defined in kernel-generator.py, so we are still able to
+//    know how the "EmptyConfig" can be mapped to a non-empty config (in kernel-generator.py).
+//    (And that is exactly what "function_pool::insert_default_entry()" and
+//                              "function_pool::get_actual_key()" are doing.)
+//
+// TODO:
+//    eventually, it would be better to turn FMKey into a struct for better readability
+//
+using FMKey = std::tuple<std::array<size_t, 2>,
+                         rocfft_precision,
+                         ComputeScheme,
+                         SBRC_TRANSPOSE_TYPE,
+                         KernelConfig>;
+
+static inline FMKey fpkey(size_t              length,
+                          rocfft_precision    precision,
+                          ComputeScheme       scheme        = CS_KERNEL_STOCKHAM,
+                          SBRC_TRANSPOSE_TYPE transpose     = NONE,
+                          KernelConfig        kernel_config = KernelConfig::EmptyConfig())
+{
+    return {{length, 0}, precision, scheme, transpose, kernel_config};
+}
+
+static inline FMKey fpkey(size_t              length1,
+                          size_t              length2,
+                          rocfft_precision    precision,
+                          ComputeScheme       scheme        = CS_KERNEL_2D_SINGLE,
+                          SBRC_TRANSPOSE_TYPE transpose     = NONE,
+                          KernelConfig        kernel_config = KernelConfig::EmptyConfig())
+{
+    return {{length1, length2}, precision, scheme, transpose, kernel_config};
+}
+
+// build a copy of base_FMKey whose KernelConfig is replaced by alt_config
+static inline FMKey get_alternative_FMKey(const FMKey& base_FMKey, const KernelConfig& alt_config)
+{
+    const auto&               lengthVec = std::get<0>(base_FMKey);
+    const rocfft_precision    precision = std::get<1>(base_FMKey);
+    const ComputeScheme       scheme    = std::get<2>(base_FMKey);
+    const SBRC_TRANSPOSE_TYPE trans     = std::get<3>(base_FMKey);
+
+    return {lengthVec, precision, scheme, trans, alt_config};
+}
+
+static void GetKernelToken(const FMKey& key, std::string& min_token)
+{
+    const auto&            lengthVec = std::get<0>(key);
+    const rocfft_precision precision = std::get<1>(key);
+    const ComputeScheme    scheme    = std::get<2>(key);
+    // token layout: kernel_len<N>[x<M>]_<precision>_<scheme abbreviation>
+    min_token = "kernel";
+
+    min_token += "_len";
+    min_token += std::to_string(lengthVec[0]);
+    if(scheme == CS_KERNEL_2D_SINGLE)
+        min_token += "x" + std::to_string(lengthVec[1]);
+
+    min_token += "_" + PrintPrecision(precision);
+    min_token += "_" + PrintKernelSchemeAbbr(scheme);
+
+    // NB: The kernel token is used when tuning the kernel configuration.
+    //     When different TPB settings are tried, the SBRCTransType may not
+    //     stay the same, so it is deliberately left out of the token; the
+    //     SBRC kernels in that solution-vec are expected to specify the
+    //     real type themselves.
+    // const SBRC_TRANSPOSE_TYPE transType = std::get<3>(key);
+    // min_token += "_" + PrintSBRCTransposeType(transType);
+}
+
+template <>
+struct ToString<FMKey>
+{
+    std::string print(const FMKey& value) const
+    {
+        std::string         str     = "{";
+        auto                len     = std::get<0>(value);
+        std::vector<size_t> lengths = {len[0], len[1]};
+
+        str += VectorFieldDescriptor<size_t>().describe("lengths", lengths) + ",";
+        str += FieldDescriptor<std::string>().describe("precision",
+                                                       PrintPrecision(std::get<1>(value)))
+               + ",";
+        str += FieldDescriptor<std::string>().describe("scheme", PrintScheme(std::get<2>(value)))
+               + ",";
+        str += FieldDescriptor<std::string>().describe("sbrc_trans",
+                                                       PrintSBRCTransposeType(std::get<3>(value)))
+               + ",";
+        str += FieldDescriptor<KernelConfig>().describe("kernelConfig", std::get<4>(value));
+        str += "}";
+        return str;
+    }
+};
+
+template <>
+struct FromString<FMKey> // rebuilds an FMKey from the fields ToString<FMKey> emits
+{
+    void Get(FMKey& ret, std::sregex_token_iterator& current) const
+    {
+        std::vector<size_t> len;
+        std::string         precStr, schemeStr, sbrcTransStr;
+        KernelConfig        config;
+        // fields are parsed in the same order ToString<FMKey>::print() describes them
+        VectorFieldParser<size_t>().parse("lengths", len, current);
+        FieldParser<std::string>().parse("precision", precStr, current);
+        FieldParser<std::string>().parse("scheme", schemeStr, current);
+        FieldParser<std::string>().parse("sbrc_trans", sbrcTransStr, current);
+        FieldParser<KernelConfig>().parse("kernelConfig", config, current);
+        // at(): a record with fewer than two lengths throws instead of reading out of bounds
+        ret = {{len.at(0), len.at(1)},
+               StrToPrecision(precStr),
+               StrToComputeScheme(schemeStr),
+               StrToSBRCTransType(sbrcTransStr),
+               config};
+    }
+};
+
+static FMKey              EmptyFMKey    = {};
+static std::vector<FMKey> EmptyFMKeyVec = {};
+
+struct SimpleHash // hash functor for FMKey: XOR of the std::hash of every key field
+{
+    size_t operator()(const FMKey& p) const noexcept
+    {
+        size_t h = 0;
+        for(const auto& v : std::get<0>(p))
+            h ^= std::hash<size_t>{}(v); // lengths are size_t; hash<int> narrowed them first
+        h ^= std::hash<rocfft_precision>{}(std::get<1>(p));
+        h ^= std::hash<ComputeScheme>{}(std::get<2>(p));
+        h ^= std::hash<SBRC_TRANSPOSE_TYPE>{}(std::get<3>(p));
+        h ^= std::hash<KernelConfig>{}(std::get<4>(p));
+
+        return h;
+    }
+};
+
+#endif
diff -Nru rocfft-5.5.0/library/src/include/function_pool.h rocfft-5.7.1/library/src/include/function_pool.h
--- rocfft-5.5.0/library/src/include/function_pool.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/function_pool.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -23,31 +23,12 @@
 #ifndef FUNCTION_POOL_H
 #define FUNCTION_POOL_H
 
+#include "../../../shared/rocfft_complex.h"
 #include "../device/kernels/common.h"
 #include "tree_node.h"
 #include <sstream>
 #include <unordered_map>
 
-using FMKey
-    = std::tuple<std::array<size_t, 2>, rocfft_precision, ComputeScheme, SBRC_TRANSPOSE_TYPE>;
-
-static inline FMKey fpkey(size_t              length,
-                          rocfft_precision    precision,
-                          ComputeScheme       scheme    = CS_KERNEL_STOCKHAM,
-                          SBRC_TRANSPOSE_TYPE transpose = NONE)
-{
-    return {{length, 0}, precision, scheme, transpose};
-}
-
-static inline FMKey fpkey(size_t              length1,
-                          size_t              length2,
-                          rocfft_precision    precision,
-                          ComputeScheme       scheme    = CS_KERNEL_2D_SINGLE,
-                          SBRC_TRANSPOSE_TYPE transpose = NONE)
-{
-    return {{length1, length2}, precision, scheme, transpose};
-}
-
 inline std::string PrintMissingKernelInfo(const FMKey& key)
 {
     const auto&               lengthVec = std::get<0>(key);
@@ -64,23 +45,8 @@
     return msg.str();
 }
 
-struct SimpleHash
-{
-    size_t operator()(const FMKey& p) const noexcept
-    {
-        size_t h = 0;
-        for(auto& v : std::get<0>(p))
-            h ^= std::hash<int>{}(v);
-        h ^= std::hash<rocfft_precision>{}(std::get<1>(p));
-        h ^= std::hash<ComputeScheme>{}(std::get<2>(p));
-        h ^= std::hash<SBRC_TRANSPOSE_TYPE>{}(std::get<3>(p));
-        return h;
-    }
-};
-
 struct FFTKernel
 {
-
     // generated launch function, which will be nullptr if the kernel
     // is built using runtime compilation
     DevFnCall           device_function = nullptr;
@@ -102,9 +68,9 @@
     // build time), using runtime compilation.
     bool aot_rtc = false;
 
-    FFTKernel() = delete;
-
+    FFTKernel()                 = default;
     FFTKernel(const FFTKernel&) = default;
+
     FFTKernel& operator=(const FFTKernel&) = default;
 
     FFTKernel(DevFnCall             fn,
@@ -127,16 +93,82 @@
         , aot_rtc(aot_rtc)
     {
     }
+
+    FFTKernel(const KernelConfig& config)
+        : factors(config.factors)
+        , transforms_per_block(config.transforms_per_block)
+        , workgroup_size(config.workgroup_size)
+        , threads_per_transform(config.threads_per_transform)
+        , use_3steps_large_twd(config.use_3steps_large_twd)
+        , half_lds(config.half_lds)
+        , direct_to_from_reg(config.direct_to_from_reg)
+    {
+    }
+
+    KernelConfig get_kernel_config() const
+    {
+        KernelConfig config;
+        config.transforms_per_block  = transforms_per_block;
+        config.workgroup_size        = workgroup_size;
+        config.threads_per_transform = threads_per_transform;
+        config.use_3steps_large_twd  = use_3steps_large_twd;
+        config.half_lds              = half_lds;
+        config.direct_to_from_reg    = direct_to_from_reg;
+        config.factors               = factors;
+
+        return config;
+    }
 };
 
 class function_pool
 {
+    // when AOT generator adds a default key-kernel,
+    // we get the keys of two version: empty-config vs full-config
+    // make the pair as an entry in a map so that we know they are the same things
+    std::unordered_map<FMKey, FMKey, SimpleHash>     def_key_pool;
     std::unordered_map<FMKey, FFTKernel, SimpleHash> function_map;
 
     ROCFFT_DEVICE_EXPORT function_pool();
 
+private:
+    static const FMKey& get_actual_key(const FMKey& key)
+    {
+        // A key queried with no/empty kernel-config refers to the default config from
+        // kernel-generator.py; def_key_pool maps it to the full key used to look up
+        // the pool.  Any other key (e.g. a dynamically added kernel) is not in
+        // def_key_pool and is returned as-is.
+        const auto& defaults = get_function_pool().def_key_pool;
+        const auto  mapped   = defaults.find(key);
+
+        if(mapped == defaults.end())
+            return key;
+        return mapped->second;
+    }
+
+    // insert a key-kernel pair for AOT generator, this is a private function and can be called
+    // only in ctor. That is, the default kernel-config we set in the kernel-generator.py
+    // we save a pair as <key-empty-config, key-actual-config> that allows us to use
+    // the empty-config key to get the default kernel
+    bool insert_default_entry(const FMKey& def_key, const FFTKernel& kernel)
+    {
+        // Register the config-free twin of def_key: same lengths, precision, scheme and
+        // transpose type, but with an empty kernel-config, so the default kernel can be
+        // looked up without knowing its exact config.
+        const auto& lens = std::get<0>(def_key);
+        def_key_pool.emplace(fpkey(lens[0],
+                                   lens[1],
+                                   std::get<1>(def_key),
+                                   std::get<2>(def_key),
+                                   std::get<3>(def_key),
+                                   KernelConfig::EmptyConfig()),
+                             def_key);
+        // the function map itself is still keyed by the detailed key (with config)
+        return function_map.emplace(def_key, kernel).second;
+    }
+
 public:
     function_pool(const function_pool&) = delete;
+
     function_pool& operator=(const function_pool&) = delete;
 
     static function_pool& get_function_pool()
@@ -147,10 +179,47 @@
 
     ~function_pool() {}
 
+    // add a new kernel in runtime
+    static bool add_new_kernel(const FMKey& new_key)
+    {
+        // already has this kernel
+        if(has_function(new_key))
+            return true;
+
+        const KernelConfig& config = std::get<4>(new_key);
+        FFTKernel           new_kernel(config);
+
+        function_pool& func_pool = get_function_pool();
+        return std::get<1>(func_pool.function_map.emplace(new_key, new_kernel));
+    }
+
+    // add an alternative kernel with different kernel config from base FMKey
+    static bool add_alternative_kernel(const FMKey&            base_FMKey,
+                                       const KernelConfig&     alt_config,
+                                       std::unique_ptr<FMKey>& out_FMKey)
+    {
+        if(!has_function(base_FMKey))
+            return false;
+
+        FFTKernel alt_kernel(alt_config);
+
+        const auto&               lengthVec = std::get<0>(base_FMKey);
+        const rocfft_precision    precision = std::get<1>(base_FMKey);
+        const ComputeScheme       scheme    = std::get<2>(base_FMKey);
+        const SBRC_TRANSPOSE_TYPE trans     = std::get<3>(base_FMKey);
+
+        out_FMKey = std::make_unique<FMKey>(lengthVec, precision, scheme, trans, alt_config);
+
+        function_pool& func_pool = get_function_pool();
+        return std::get<1>(func_pool.function_map.emplace(*out_FMKey, alt_kernel));
+    }
+
     static bool has_function(const FMKey& key)
     {
         function_pool& func_pool = get_function_pool();
-        return func_pool.function_map.count(key) > 0;
+        // get_actual_key() returns a reference; bind it instead of copying the whole key
+        const auto& real_key = function_pool::get_actual_key(key);
+        return func_pool.function_map.count(real_key) > 0;
     }
 
     static size_t get_largest_length(rocfft_precision precision)
@@ -181,13 +250,17 @@
     static DevFnCall get_function(const FMKey& key)
     {
         function_pool& func_pool = get_function_pool();
-        return func_pool.function_map.at(key).device_function;
+        // get_actual_key() returns a reference; bind it instead of copying the whole key
+        const auto& real_key = function_pool::get_actual_key(key);
+        return func_pool.function_map.at(real_key).device_function;
     }
 
     static FFTKernel get_kernel(const FMKey& key)
     {
         function_pool& func_pool = get_function_pool();
-        return func_pool.function_map.at(key);
+        // get_actual_key() returns a reference; bind it instead of copying the whole key
+        const auto& real_key = function_pool::get_actual_key(key);
+        return func_pool.function_map.at(real_key);
     }
 
     // helper for common used
@@ -196,9 +269,11 @@
         return has_function(fpkey(length, precision, CS_KERNEL_STOCKHAM_BLOCK_CC));
     }
 
-    static bool has_SBRC_kernel(size_t length, rocfft_precision precision)
+    static bool has_SBRC_kernel(size_t              length,
+                                rocfft_precision    precision,
+                                SBRC_TRANSPOSE_TYPE trans_type = TILE_ALIGNED)
     {
-        return has_function(fpkey(length, precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
+        return has_function(fpkey(length, precision, CS_KERNEL_STOCKHAM_BLOCK_RC, trans_type));
     }
 
     static bool has_SBCR_kernel(size_t length, rocfft_precision precision)
diff -Nru rocfft-5.5.0/library/src/include/kernel_launch.h rocfft-5.7.1/library/src/include/kernel_launch.h
--- rocfft-5.5.0/library/src/include/kernel_launch.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/kernel_launch.h	2023-08-09 16:19:51.000000000 +0000
@@ -28,10 +28,10 @@
 #include "error.h"
 #endif
 #include "../../../shared/array_predicate.h"
+#include "../../../shared/rocfft_hip.h"
 #include "../device/kernels/callback.h"
 #include "kargs.h"
 #include "rocfft.h"
-#include "rocfft_hip.h"
 #include "tree_node.h"
 #include <iostream>
 
diff -Nru rocfft-5.5.0/library/src/include/logging.h rocfft-5.7.1/library/src/include/logging.h
--- rocfft-5.5.0/library/src/include/logging.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/logging.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -132,6 +132,7 @@
 extern int log_plan_fd;
 extern int log_kernelio_fd;
 extern int log_rtc_fd;
+extern int log_tuning_fd;
 
 /*! \brief Indicates if layer is active with bitmask*/
 typedef enum rocfft_layer_mode_
@@ -143,6 +144,7 @@
     rocfft_layer_mode_log_plan     = 0b0000001000, //  8
     rocfft_layer_mode_log_kernelio = 0b0000010000, // 16
     rocfft_layer_mode_log_rtc      = 0b0000100000, // 32
+    rocfft_layer_mode_log_tuning   = 0b0001000000, // 64
 } rocfft_layer_mode;
 
 class LogSingleton
@@ -213,6 +215,13 @@
         static thread_local rocfft_ostream log_rtc_os(log_rtc_fd);
         return &log_rtc_os;
     }
+    rocfft_ostream* GetTuningOS()
+    {
+        if(log_tuning_fd == -1)
+            return &rocfft_cerr;
+        static thread_local rocfft_ostream log_tuning_os(log_tuning_fd);
+        return &log_tuning_os;
+    }
 };
 
 #define LOG_TRACE_ENABLED() \
@@ -225,6 +234,8 @@
 #define LOG_KERNELIO_ENABLED() \
     (LogSingleton::GetInstance().GetLayerMode() & rocfft_layer_mode_log_kernelio)
 #define LOG_RTC_ENABLED() (LogSingleton::GetInstance().GetLayerMode() & rocfft_layer_mode_log_rtc)
+#define LOG_TUNING_ENABLED() \
+    (LogSingleton::GetInstance().GetLayerMode() & rocfft_layer_mode_log_tuning)
 
 // if profile logging is turned on with
 // (layer_mode & rocfft_layer_mode_log_profile) != 0
diff -Nru rocfft-5.5.0/library/src/include/node_factory.h rocfft-5.7.1/library/src/include/node_factory.h
--- rocfft-5.5.0/library/src/include/node_factory.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/node_factory.h	2023-08-09 16:19:51.000000000 +0000
@@ -39,10 +39,20 @@
     // Create node (user level) using this function
     static std::unique_ptr<TreeNode> CreateNodeFromScheme(ComputeScheme s,
                                                           TreeNode*     parent = nullptr);
-    static std::unique_ptr<TreeNode> CreateExplicitNode(NodeMetaData& nodeData, TreeNode* parent);
+    static std::unique_ptr<TreeNode> CreateExplicitNode(NodeMetaData& nodeData,
+                                                        TreeNode*     parent,
+                                                        ComputeScheme determined_scheme = CS_NONE);
 
+    // Checks if there exists native (radix) support for a given length. If
+    // no support exists for the given length, Bluestein algorithm is needed.
+    static bool SupportedLength(rocfft_precision precision, size_t len);
+
+    // Checks if  the non-pow2 length input is supported for a Bluestein compute scheme
     static bool NonPow2LengthSupported(rocfft_precision precision, size_t len);
 
+    // Gets a (potentially non-pow2) length to run Bluestein
+    static size_t GetBluesteinLength(rocfft_precision precision, size_t len);
+
     // Decide scheme from the node meta node
     static ComputeScheme DecideNodeScheme(NodeMetaData& nodeData, TreeNode* parent);
     static ComputeScheme DecideRealScheme(NodeMetaData& nodeData);
diff -Nru rocfft-5.5.0/library/src/include/option_util.h rocfft-5.7.1/library/src/include/option_util.h
--- rocfft-5.5.0/library/src/include/option_util.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/option_util.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,435 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+// This emulates the required functionality of boost::program_options
+
+#ifndef PROG_OPTION_UTIL_H
+#define PROG_OPTION_UTIL_H
+
+#include <cinttypes>
+#include <cstdio>
+#include <iomanip>
+#include <ostream>
+#include <regex>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+// Regular expression for token delimiters (whitespace and commas)
+#define PROGRAM_OPTIONS_REGEX "[, \\f\\n\\r\\t\\v]+"
+#define VECTOR_DELIM ","
+
+// variables_map is a set of seen options
+using variables_map = std::set<std::string>;
+
+// Polymorphic base class to use with dynamic_cast
+class value_base
+{
+protected:
+    bool m_has_default = false;
+
+public:
+    bool has_default() const
+    {
+        return m_has_default;
+    }
+
+    virtual ~value_base() = default;
+};
+
+// Value parameters
+template <typename T>
+class value : public value_base
+{
+    T* m_var; // Pointer to variable to be modified
+
+public:
+    // Constructor
+    explicit value(T* var)
+        : m_var(var)
+    {
+    }
+
+    // Pointer to variable
+    T* get_ptr() const
+    {
+        return m_var;
+    }
+
+    // Allows default_value()
+    value* operator->()
+    {
+        return this;
+    }
+
+    // Set default value
+    value& default_value(T val)
+    {
+        *m_var        = std::move(val);
+        m_has_default = true;
+        return *this;
+    }
+};
+
+// bool_switch is a value<bool>, which is handled specially
+using bool_switch = value<bool>;
+
+class options_description
+{
+    // desc_option describes a particular option
+    class desc_option
+    {
+        std::string m_opts;
+        value_base* m_val;
+        std::string m_desc;
+
+    public:
+        // Constructor with options, value and description
+        template <typename T>
+        desc_option(std::string opts, value<T> val, std::string desc)
+            : m_opts(std::move(opts))
+            , m_val(new auto(std::move(val)))
+            , m_desc(std::move(desc))
+        {
+        }
+
+        // Constructor with options and description
+        desc_option(std::string opts, std::string desc)
+            : m_opts(std::move(opts))
+            , m_val(nullptr)
+            , m_desc(std::move(desc))
+        {
+        }
+
+        // Copy constructor is deleted
+        desc_option(const desc_option&) = delete;
+
+        // Move constructor
+        desc_option(desc_option&& other)
+            : m_opts(std::move(other.m_opts))
+            , m_val(other.m_val)
+            , m_desc(std::move(other.m_desc))
+        {
+            other.m_val = nullptr;
+        }
+
+        // Destructor
+        ~desc_option()
+        {
+            delete m_val;
+        }
+
+        // Accessors
+        const std::string& get_opts() const
+        {
+            return m_opts;
+        }
+
+        const value_base* get_val() const
+        {
+            return m_val;
+        }
+
+        const std::string& get_desc() const
+        {
+            return m_desc;
+        }
+
+        // Set a value
+        void set_val(int& argc, char**& argv) const
+        {
+            // We test all supported types with dynamic_cast and parse accordingly
+            bool match = false;
+            if(dynamic_cast<value<int32_t>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<int32_t>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, "%" SCNd32, val) == 1;
+            }
+            else if(dynamic_cast<value<uint32_t>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<uint32_t>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, "%" SCNu32, val) == 1;
+            }
+            else if(dynamic_cast<value<int64_t>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<int64_t>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, "%" SCNd64, val) == 1;
+            }
+            else if(dynamic_cast<value<uint64_t>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<uint64_t>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, "%" SCNu64, val) == 1;
+            }
+            else if(dynamic_cast<value<float>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<float>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, "%f", val) == 1;
+            }
+            else if(dynamic_cast<value<double>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<double>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, "%lf", val) == 1;
+            }
+            else if(dynamic_cast<value<char>*>(m_val))
+            {
+                auto* val = dynamic_cast<value<char>*>(m_val)->get_ptr();
+                match     = argc && sscanf(*argv, " %c", val) == 1;
+            }
+            else if(dynamic_cast<value<bool>*>(m_val))
+            {
+                // We handle bool specially, setting the value to true without argument
+                auto* val = dynamic_cast<value<bool>*>(m_val)->get_ptr();
+                *val      = true;
+                return;
+            }
+            else if(dynamic_cast<value<std::string>*>(m_val))
+            {
+                if(argc)
+                {
+                    *dynamic_cast<value<std::string>*>(m_val)->get_ptr() = *argv;
+                    match                                                = true;
+                }
+            }
+            else
+            {
+                throw std::logic_error("Internal error: Unsupported data type");
+            }
+
+            if(!match)
+                throw std::invalid_argument(argc ? *argv : "Missing required argument");
+
+            // Skip past the argument's value
+            ++argv;
+            --argc;
+        }
+    };
+
+    // Description and option list
+    std::string              m_desc;
+    std::vector<desc_option> m_optlist;
+
+    // desc_optionlist allows chains of options to be parenthesized
+    class desc_optionlist
+    {
+        std::vector<desc_option>& m_list;
+
+    public:
+        explicit desc_optionlist(std::vector<desc_option>& list)
+            : m_list(list)
+        {
+        }
+
+        template <typename... Ts>
+        desc_optionlist operator()(Ts&&... arg)
+        {
+            m_list.push_back(desc_option(std::forward<Ts>(arg)...));
+            return *this;
+        }
+    };
+
+public:
+    // Constructor
+    explicit options_description(std::string desc)
+        : m_desc(std::move(desc))
+    {
+    }
+
+    // Start a desc_optionlist chain
+    desc_optionlist add_options() &
+    {
+        return desc_optionlist(m_optlist);
+    }
+
+    // Parse an option at the current (argc, argv) position
+    void parse_option(int& argc, char**& argv, variables_map& vm, bool ignoreUnknown = false) const
+    {
+        static const std::regex program_options_regex{PROGRAM_OPTIONS_REGEX,
+                                                      std::regex_constants::optimize};
+
+        // Iterate across all options
+        for(const auto& opt : m_optlist)
+        {
+            // Canonical name used for map
+            std::string canonical_name;
+
+            // Iterate across tokens in the opts
+            for(std::sregex_token_iterator tok{
+                    opt.get_opts().begin(), opt.get_opts().end(), program_options_regex, -1};
+                tok != std::sregex_token_iterator();
+                ++tok)
+            {
+                // The first option in a list of options is the canonical name
+                if(!canonical_name.length())
+                    canonical_name = tok->str();
+
+                // If the length of the option is 1, it is single-dash; otherwise double-dash
+                const char* prefix = tok->length() == 1 ? "-" : "--";
+
+                // If option matches
+                if(*argv == prefix + tok->str())
+                {
+                    ++argv;
+                    --argc;
+
+                    // If option has a value, set it; otherwise indicate option in set
+                    if(opt.get_val())
+                        opt.set_val(argc, argv);
+                    // else
+                    vm.insert(canonical_name);
+                    return; // Return successfully
+                }
+            }
+        }
+
+        // No options were matched
+        if(ignoreUnknown)
+        {
+            ++argv;
+            --argc;
+        }
+        else
+            throw std::invalid_argument(*argv);
+    }
+
+    // Formatted output of command-line arguments description
+    friend std::ostream& operator<<(std::ostream& os, const options_description& d)
+    {
+        static const std::regex program_options_regex{PROGRAM_OPTIONS_REGEX,
+                                                      std::regex_constants::optimize};
+
+        // Iterate across all options
+        for(const auto& opt : d.m_optlist)
+        {
+            bool               first = true;
+            const char*        delim = "";
+            std::ostringstream left;
+
+            // Iterate across tokens in the opts
+            for(std::sregex_token_iterator tok{opt.get_opts().begin(),
+                                               opt.get_opts().end(),
+                                               program_options_regex,
+                                               -1};
+                tok != std::sregex_token_iterator();
+                ++tok, first = false, delim = " ")
+            {
+                // If the length of the option is 1, it is single-dash; otherwise double-dash
+                const char* prefix = tok->length() == 1 ? "-" : "--";
+                left << delim << (first ? "" : "[ ") << prefix << tok->str() << (first ? "" : " ]");
+            }
+
+            // Print the default value of the variable type if it exists
+            // We do not print the default value for bool
+            const value_base* val = opt.get_val();
+            if(val && !dynamic_cast<const value<bool>*>(val))
+            {
+                left << " arg";
+                if(val->has_default())
+                {
+                    // We test all supported types with dynamic_cast and print accordingly
+                    left << " (=";
+                    if(dynamic_cast<const value<int32_t>*>(val))
+                        left << *dynamic_cast<const value<int32_t>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<uint32_t>*>(val))
+                        left << *dynamic_cast<const value<uint32_t>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<int64_t>*>(val))
+                        left << *dynamic_cast<const value<int64_t>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<uint64_t>*>(val))
+                        left << *dynamic_cast<const value<uint64_t>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<float>*>(val))
+                        left << *dynamic_cast<const value<float>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<double>*>(val))
+                        left << *dynamic_cast<const value<double>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<char>*>(val))
+                        left << *dynamic_cast<const value<char>*>(val)->get_ptr();
+                    else if(dynamic_cast<const value<std::string>*>(val))
+                        left << *dynamic_cast<const value<std::string>*>(val)->get_ptr();
+                    else
+                        throw std::logic_error("Internal error: Unsupported data type");
+                    left << ")";
+                }
+            }
+            os << std::setw(36) << std::left << left.str() << " " << opt.get_desc() << "\n\n";
+        }
+        return os << std::flush;
+    }
+};
+
+// Command line parser: runs desc.parse_option() over argv[1..] and keeps the seen options
+class parse_command_line
+{
+    variables_map m_vm;
+
+public:
+    parse_command_line(int                        argc,
+                       char**                     argv,
+                       const options_description& desc,
+                       bool                       ignoreUnknown = false)
+    {
+        ++argv; // Skip argv[0]
+        --argc;
+        while(argc > 0) // argc == 0 on entry becomes -1 here; must not start looping
+            desc.parse_option(argc, argv, m_vm, ignoreUnknown);
+    }
+
+    // Copy the variables_map
+    friend void store(const parse_command_line& p, variables_map& vm)
+    {
+        vm = p.m_vm;
+    }
+
+    // Move the variables_map
+    friend void store(parse_command_line&& p, variables_map& vm)
+    {
+        vm = std::move(p.m_vm);
+    }
+};
+
+// We can define the notify() function as a no-op for our purposes
+inline void notify(const variables_map&) {}
+
+// Split inStr on VECTOR_DELIM and append each token, converted with std::stoull, to
+// outVector.  inline: this header may be included from more than one translation unit.
+inline void parse_arg_ints(std::string const& inStr, std::vector<size_t>& outVector)
+{
+    static const std::regex vector_delim{VECTOR_DELIM, std::regex_constants::optimize};
+    for(std::sregex_token_iterator tok{inStr.begin(), inStr.end(), vector_delim, -1};
+        tok != std::sregex_token_iterator();
+        ++tok)
+    {
+        outVector.push_back(std::stoull(tok->str())); // stoi would reject values > INT_MAX
+    }
+}
+
+// Split inStr on VECTOR_DELIM and append each token, unchanged, to outVector.
+// inline: this header may be included from more than one translation unit.
+inline void parse_arg_strings(std::string const& inStr, std::vector<std::string>& outVector)
+{
+    static const std::regex vector_delim{VECTOR_DELIM, std::regex_constants::optimize};
+    for(std::sregex_token_iterator tok{inStr.begin(), inStr.end(), vector_delim, -1};
+        tok != std::sregex_token_iterator();
+        ++tok)
+    {
+        outVector.push_back(tok->str());
+    }
+}
+
+#endif // PROG_OPTION_UTIL_H
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/include/plan.h rocfft-5.7.1/library/src/include/plan.h
--- rocfft-5.5.0/library/src/include/plan.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/plan.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -50,8 +50,8 @@
 
 struct rocfft_plan_description_t
 {
-    rocfft_array_type inArrayType  = rocfft_array_type_complex_interleaved;
-    rocfft_array_type outArrayType = rocfft_array_type_complex_interleaved;
+    rocfft_array_type inArrayType  = rocfft_array_type_unset;
+    rocfft_array_type outArrayType = rocfft_array_type_unset;
 
     std::array<size_t, 3> inStrides  = {0, 0, 0};
     std::array<size_t, 3> outStrides = {0, 0, 0};
@@ -65,6 +65,15 @@
     double scale_factor = 1.0;
 
     rocfft_plan_description_t() = default;
+
+    // A plan description is created in a vacuum and does not know what
+    // type of transform it will be for.  Once that's known, we can
+    // initialize default values for in/out type, stride, dist if they're
+    // unspecified.
+    void init_defaults(rocfft_transform_type        transformType,
+                       rocfft_result_placement      placement,
+                       size_t                       rank,
+                       const std::array<size_t, 3>& lengths);
 };
 
 struct rocfft_plan_t
@@ -83,8 +92,17 @@
     rocfft_plan_t() = default;
 
     ExecPlan execPlan;
+
+    // Users can provide lengths+strides in any order, but we'll
+    // construct the most sensible plans if they're in row-major order.
+    // Sort the FFT dimensions.
+    //
+    // This should be done when the plan parameters are known, but
+    // before we start creating any child nodes from the root plan.
+    void sort();
 };
 
 bool PlanPowX(ExecPlan& execPlan);
+bool GetTuningKernelInfo(ExecPlan& execPlan);
 
 #endif // PLAN_H
diff -Nru rocfft-5.5.0/library/src/include/ref_cpu.h rocfft-5.7.1/library/src/include/ref_cpu.h
--- rocfft-5.5.0/library/src/include/ref_cpu.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/ref_cpu.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -23,7 +23,6 @@
 
 #ifdef REF_DEBUG
 
-#include <complex>
 #include <cstdio>
 #include <dlfcn.h>
 #include <functional>
@@ -202,10 +201,10 @@
 
         size_t in_typesize  = (data->node->inArrayType == rocfft_array_type_real)
                                   ? sizeof(float)
-                                  : sizeof(std::complex<float>);
+                                  : sizeof(rocfft_complex<float>);
         size_t out_typesize = (data->node->outArrayType == rocfft_array_type_real)
                                   ? sizeof(float)
-                                  : sizeof(std::complex<float>);
+                                  : sizeof(rocfft_complex<float>);
 
         size_t     insize  = 0;
         size_t     outsize = 0;
@@ -232,13 +231,13 @@
         case CS_KERNEL_R_TO_CMPLX:
             insize      = batch * data->node->length[0];
             outsize     = batch * (data->node->length[0] + 1);
-            in_typesize = sizeof(std::complex<float>);
+            in_typesize = sizeof(rocfft_complex<float>);
             break;
         case CS_KERNEL_CMPLX_TO_R:
             insize       = batch * (data->node->length[0] + 1);
             outsize      = batch * data->node->length[0];
-            in_typesize  = sizeof(std::complex<float>);
-            out_typesize = sizeof(std::complex<float>);
+            in_typesize  = sizeof(rocfft_complex<float>);
+            out_typesize = sizeof(rocfft_complex<float>);
             break;
         default:
             insize  = std::accumulate(data->node->length.begin(),
@@ -262,10 +261,10 @@
 #if 0
         // Initialize the code to some known value to help debug the
         // cpu reference implementation.
-        std::complex<float>* input = (std::complex<float>*)fftwin.data;
+        rocfft_complex<float>* input = (rocfft_complex<float>*)fftwin.data;
         for(int r = 0; r < fftwin.size; ++r)
         {
-            input[r] = std::complex<float>(r + 0.5, r * r + 3);
+            input[r] = rocfft_complex<float>(r + 0.5, r * r + 3);
         }
 #endif
     }
@@ -344,7 +343,7 @@
         }
 
         void*   buf = ((char*)data->bufIn[0] + offset);
-        fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(std::complex<float>));
+        fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(rocfft_complex<float>));
         hipMemcpy(tmp_mem.data, buf, in_size_bytes, hipMemcpyDeviceToHost);
 
         CopyVector((local_fftwf_complex*)fftwin.data,
@@ -355,11 +354,14 @@
                    data->node->inStride);
     }
 
-    inline static float2
-        TwMul(float2* twiddles, const size_t twl, const int direction, float2 val, size_t u)
+    inline static rocfft_complex<float> TwMul(rocfft_complex<float>* twiddles,
+                                              const size_t           twl,
+                                              const int              direction,
+                                              rocfft_complex<float>  val,
+                                              size_t                 u)
     {
-        size_t j      = u & 255;
-        float2 result = twiddles[j];
+        size_t                j      = u & 255;
+        rocfft_complex<float> result = twiddles[j];
 
         float  real, imag;
         size_t h = 1;
@@ -540,8 +542,8 @@
         case CS_KERNEL_TRANSPOSE:
         {
             // TODO: what about the real transpose case?
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
-            std::complex<float>* in = (std::complex<float>*)fftwin.data;
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
+            rocfft_complex<float>* in = (rocfft_complex<float>*)fftwin.data;
 
             CopyInputVector(data_p);
 
@@ -567,9 +569,9 @@
             }
             else
             {
-                float2*                   twtc;
-                size_t                    ns = 0;
-                TwiddleTableLarge<float2> twTable(data->node->large1D);
+                rocfft_complex<float>*                   twtc;
+                size_t                                   ns = 0;
+                TwiddleTableLarge<rocfft_complex<float>> twTable(data->node->large1D);
                 std::tie(ns, twtc) = twTable.GenerateTwiddleTable();
 
                 int twl = 0;
@@ -591,7 +593,7 @@
                     {
                         for(size_t j = 0; j < cols; j++)
                         {
-                            float2 in_v, ot_v;
+                            rocfft_complex<float> in_v, ot_v;
 
                             in_v.x = in[b * rows * cols + i * cols + j].real();
                             in_v.y = in[b * rows * cols + i * cols + j].imag();
@@ -608,14 +610,14 @@
         break;
         case CS_KERNEL_COPY_R_TO_CMPLX:
         {
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
-            size_t in_size_bytes    = (data->node->iDist * data->node->batch) * sizeof(float);
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
+            size_t in_size_bytes      = (data->node->iDist * data->node->batch) * sizeof(float);
 
-            fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(std::complex<float>));
+            fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(rocfft_complex<float>));
 
             hipMemcpy(tmp_mem.data, data->bufIn[0], in_size_bytes, hipMemcpyDeviceToHost);
 
-            std::complex<float>* tmp_data = (std::complex<float>*)tmp_mem.data;
+            rocfft_complex<float>* tmp_data = (rocfft_complex<float>*)tmp_mem.data;
 
             size_t elements = 1;
             for(size_t d = 0; d < data->node->length.size(); d++)
@@ -627,23 +629,23 @@
                 for(size_t i = 0; i < elements; i++)
                 {
                     ot[data->node->oDist * b + i]
-                        = std::complex<float>(tmp_data[data->node->iDist * b + i].real(), 0.0);
+                        = rocfft_complex<float>(tmp_data[data->node->iDist * b + i].real(), 0.0);
                 }
             }
         }
         break;
         case CS_KERNEL_COPY_CMPLX_TO_HERM:
         {
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
             // assump the input is complex, the output is hermitian on take the first
             // [N/2 + 1] elements
             size_t in_size_bytes = (data->node->iDist * data->node->batch) * 2 * sizeof(float);
 
-            fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(std::complex<float>));
+            fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(rocfft_complex<float>));
 
             hipMemcpy(tmp_mem.data, data->bufIn[0], in_size_bytes, hipMemcpyDeviceToHost);
 
-            std::complex<float>* tmp_data = (std::complex<float>*)tmp_mem.data;
+            rocfft_complex<float>* tmp_data = (rocfft_complex<float>*)tmp_mem.data;
 
             size_t elements = 1;
             elements *= data->node->length[0] / 2 + 1;
@@ -665,16 +667,16 @@
         break;
         case CS_KERNEL_COPY_HERM_TO_CMPLX:
         {
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
             // assump the input is hermitian, the output is complex on take the first
             // [N/2 + 1] elements
             size_t in_size_bytes = (data->node->iDist * data->node->batch) * 2 * sizeof(float);
 
-            fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(std::complex<float>));
+            fftwbuf tmp_mem(data->node->iDist * data->node->batch, sizeof(rocfft_complex<float>));
 
             hipMemcpy(tmp_mem.data, data->bufIn[0], in_size_bytes, hipMemcpyDeviceToHost);
 
-            std::complex<float>* tmp_data = (std::complex<float>*)tmp_mem.data;
+            rocfft_complex<float>* tmp_data = (rocfft_complex<float>*)tmp_mem.data;
 
             size_t output_size = data->node->length[0];
             size_t input_size  = output_size / 2 + 1;
@@ -713,14 +715,14 @@
             assert(fftwin.size == batch * halfN);
             assert(fftwout.size == batch * (halfN + 1));
 
-            const auto           input  = (std::complex<float>*)fftwin.data;
-            std::complex<float>* output = (std::complex<float>*)fftwout.data;
+            const auto             input  = (rocfft_complex<float>*)fftwin.data;
+            rocfft_complex<float>* output = (rocfft_complex<float>*)fftwout.data;
 
             size_t output_idx_base = 0;
 
-            const std::complex<float> I(0, 1);
-            const std::complex<float> one(1, 0);
-            const std::complex<float> half(0.5, 0);
+            const rocfft_complex<float> I(0, 1);
+            const rocfft_complex<float> one(1, 0);
+            const rocfft_complex<float> half(0.5, 0);
 
             const float overN = 0.5 / halfN;
 
@@ -728,17 +730,17 @@
             {
                 const auto bin  = input + ibatch * halfN;
                 auto       bout = output + ibatch * (halfN + 1);
-                bout[0]         = std::complex<float>(bin[0].real() + bin[0].imag());
+                bout[0]         = rocfft_complex<float>(bin[0].real() + bin[0].imag());
                 for(int r = 1; r < halfN; ++r)
                 {
                     const auto omegaNr
-                        = std::exp(std::complex<float>(0.0f, (float)(-2.0f * M_PI * r * overN)));
+                        = std::exp(rocfft_complex<float>(0.0f, (float)(-2.0f * M_PI * r * overN)));
                     bout[r] = bin[r] * half * (one - I * omegaNr)
                               + conj(bin[halfN - r]) * half * (one + I * omegaNr);
                 }
             }
             output[output_idx_base + halfN]
-                = std::complex<float>(input[0].real() - input[0].imag(), 0);
+                = rocfft_complex<float>(input[0].real() - input[0].imag(), 0);
         }
         break;
         case CS_KERNEL_CMPLX_TO_R:
@@ -749,15 +751,15 @@
 
             assert(fftwin.size == batch * (halfN + 1));
             assert(fftwout.size == batch * halfN);
-            assert(fftwin.typesize == sizeof(std::complex<float>));
-            assert(fftwout.typesize == sizeof(std::complex<float>));
+            assert(fftwin.typesize == sizeof(rocfft_complex<float>));
+            assert(fftwout.typesize == sizeof(rocfft_complex<float>));
 
-            const std::complex<float>* input  = (std::complex<float>*)fftwin.data;
-            std::complex<float>*       output = (std::complex<float>*)fftwout.data;
+            const rocfft_complex<float>* input  = (rocfft_complex<float>*)fftwin.data;
+            rocfft_complex<float>*       output = (rocfft_complex<float>*)fftwout.data;
 
-            const float               overN = 0.5 / halfN;
-            const std::complex<float> I(0, 1);
-            const std::complex<float> one(1, 0);
+            const float                 overN = 0.5 / halfN;
+            const rocfft_complex<float> I(0, 1);
+            const rocfft_complex<float> one(1, 0);
 
             for(int ibatch = 0; ibatch < batch; ++ibatch)
             {
@@ -765,7 +767,7 @@
                 auto       bout = output + ibatch * halfN;
                 for(int r = 0; r < halfN; ++r)
                 {
-                    const auto omegaNr = std::exp(std::complex<float>(0, 2.0 * M_PI * r * overN));
+                    const auto omegaNr = std::exp(rocfft_complex<float>(0, 2.0 * M_PI * r * overN));
                     bout[r]
                         = bin[r] * (one + I * omegaNr) + conj(bin[halfN - r]) * (one - I * omegaNr);
                 }
@@ -781,8 +783,8 @@
         break;
         case CS_KERNEL_PAD_MUL:
         {
-            std::complex<float>* in = (std::complex<float>*)fftwin.data;
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
+            rocfft_complex<float>* in = (rocfft_complex<float>*)fftwin.data;
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
             CopyInputVector(data_p);
 
             size_t howmany = data->node->batch;
@@ -792,11 +794,11 @@
             size_t N = data->node->length[0];
             size_t M = data->node->lengthBlue;
 
-            fftwbuf chirp_mem(M * 2, sizeof(std::complex<float>));
+            fftwbuf chirp_mem(M * 2, sizeof(rocfft_complex<float>));
 
             chirp(N, M, data->node->direction, (local_fftwf_complex*)chirp_mem.data);
 
-            std::complex<float>* chirp_data = (std::complex<float>*)chirp_mem.data;
+            rocfft_complex<float>* chirp_data = (rocfft_complex<float>*)chirp_mem.data;
 
             for(size_t b = 0; b < howmany; b++)
             {
@@ -823,18 +825,18 @@
         break;
         case CS_KERNEL_FFT_MUL:
         {
-            std::complex<float>* in = (std::complex<float>*)fftwin.data;
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
-            size_t               M  = data->node->lengthBlue;
-            size_t               N  = data->node->parent->length[0];
+            rocfft_complex<float>* in = (rocfft_complex<float>*)fftwin.data;
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
+            size_t                 M  = data->node->lengthBlue;
+            size_t                 N  = data->node->parent->length[0];
 
             CopyInputVector(data_p, M * 2 * 2 * sizeof(float));
 
-            fftwbuf chirp_mem(M * 2, sizeof(std::complex<float>));
+            fftwbuf chirp_mem(M * 2, sizeof(rocfft_complex<float>));
 
             chirp_fft(N, M, data->node->direction, (local_fftwf_complex*)chirp_mem.data);
 
-            std::complex<float>* chirp_data = (std::complex<float>*)chirp_mem.data;
+            rocfft_complex<float>* chirp_data = (rocfft_complex<float>*)chirp_mem.data;
 
             size_t howmany = data->node->batch;
             for(size_t i = 1; i < data->node->length.size(); i++)
@@ -857,18 +859,18 @@
         break;
         case CS_KERNEL_RES_MUL:
         {
-            std::complex<float>* in = (std::complex<float>*)fftwin.data;
-            std::complex<float>* ot = (std::complex<float>*)fftwout.data;
-            size_t               M  = data->node->lengthBlue;
-            size_t               N  = data->node->length[0];
+            rocfft_complex<float>* in = (rocfft_complex<float>*)fftwin.data;
+            rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
+            size_t                 M  = data->node->lengthBlue;
+            size_t                 N  = data->node->length[0];
 
             CopyInputVector(data_p, M * 2 * 2 * sizeof(float));
 
-            fftwbuf chirp_mem(M * 2, sizeof(std::complex<float>));
+            fftwbuf chirp_mem(M * 2, sizeof(rocfft_complex<float>));
 
             chirp(N, M, data->node->direction, (local_fftwf_complex*)chirp_mem.data);
 
-            std::complex<float>* chirp_data = (std::complex<float>*)chirp_mem.data;
+            rocfft_complex<float>* chirp_data = (rocfft_complex<float>*)chirp_mem.data;
 
             size_t howmany = data->node->batch;
             for(size_t i = 1; i < data->node->length.size(); i++)
@@ -941,7 +943,7 @@
             break;
         }
 
-        fftwbuf tmp_mem(out_size, sizeof(std::complex<float>));
+        fftwbuf tmp_mem(out_size, sizeof(rocfft_complex<float>));
 
         // Copy the device information to out local buffer:
         hipMemcpy(tmp_mem.data, bufOut, tmp_mem.bufsize(), hipMemcpyDeviceToHost);
@@ -1014,8 +1016,8 @@
 
         // compare library results vs CPU results
         // TODO: what about real-valued outputs?
-        std::complex<float>* lb = (std::complex<float>*)libout.data;
-        std::complex<float>* ot = (std::complex<float>*)fftwout.data;
+        rocfft_complex<float>* lb = (rocfft_complex<float>*)libout.data;
+        rocfft_complex<float>* ot = (rocfft_complex<float>*)fftwout.data;
         for(size_t i = 0; i < checklength; i++)
         {
             double ac_r = lb[i].real();
@@ -1037,7 +1039,7 @@
         rocfft_cout << "---------------------------------------------" << std::endl;
 
 #if 0
-        std::complex<float>* in      = (std::complex<float>*)fftwin.data;
+        rocfft_complex<float>* in      = (rocfft_complex<float>*)fftwin.data;
 
         rocfft_cout << "input:" << std::endl;
         for(size_t i = 0; i < fftwin.size; ++i)
diff -Nru rocfft-5.5.0/library/src/include/repo.h rocfft-5.7.1/library/src/include/repo.h
--- rocfft-5.5.0/library/src/include/repo.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/repo.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -32,7 +32,7 @@
 
     // key structure for 1D twiddles - these are the arguments to
     // twiddle creation
-    struct repo_key_1D_t
+    struct repo_twd_key_1D_t
     {
         // twiddle table length
         size_t           length       = 0;
@@ -46,7 +46,7 @@
         // twiddles
         int deviceId = 0;
 
-        bool operator<(const repo_key_1D_t& other) const
+        bool operator<(const repo_twd_key_1D_t& other) const
         {
             if(length != other.length)
                 return length < other.length;
@@ -64,7 +64,7 @@
         }
     };
     // key structure for 2D twiddles
-    struct repo_key_2D_t
+    struct repo_twd_key_2D_t
     {
         size_t           length0   = 0;
         size_t           length1   = 0;
@@ -73,7 +73,7 @@
         // twiddles
         int deviceId = 0;
 
-        bool operator<(const repo_key_2D_t& other) const
+        bool operator<(const repo_twd_key_2D_t& other) const
         {
             if(length0 != other.length0)
                 return length0 < other.length0;
@@ -84,6 +84,24 @@
             return deviceId < other.deviceId;
         }
     };
+    // map key for a chirp table: (length, precision, deviceId)
+    struct repo_chirp_key_t
+    {
+        size_t           length    = 0;
+        rocfft_precision precision = rocfft_precision_single;
+        // chirp buffers live in device memory, so each device
+        // gets its own entry
+        int deviceId = 0;
+
+        // strict ordering: compare length first, then precision,
+        // and fall back to deviceId when both are equal
+        bool operator<(const repo_chirp_key_t& other) const
+        {
+            return length != other.length         ? length < other.length
+                   : precision != other.precision ? precision < other.precision
+                                                  : deviceId < other.deviceId;
+        }
+    };
 
     // twiddle tables are buffers in device memory, along with a
     // reference count
@@ -91,13 +109,19 @@
     // NOTE: some buffers might be more shareable here (e.g. simple
     // 1D might match half of a 2D twiddle, or a simple 1D might be
     // shareable with a same-length attach_halfN buffer)
-    std::map<repo_key_1D_t, std::pair<gpubuf, unsigned int>> twiddles_1D;
-    std::map<repo_key_2D_t, std::pair<gpubuf, unsigned int>> twiddles_2D;
+    std::map<repo_twd_key_1D_t, std::pair<gpubuf, unsigned int>> twiddles_1D;
+    std::map<repo_twd_key_2D_t, std::pair<gpubuf, unsigned int>> twiddles_2D;
+
+    std::map<repo_chirp_key_t, std::pair<gpubuf, unsigned int>> chirp;
+
     // reverse-map the device pointers back to the keys so users can
     // free the pointer they were given
-    std::map<void*, repo_key_1D_t> twiddles_1D_reverse;
-    std::map<void*, repo_key_2D_t> twiddles_2D_reverse;
-    static std::mutex              mtx;
+    std::map<void*, repo_twd_key_1D_t> twiddles_1D_reverse;
+    std::map<void*, repo_twd_key_2D_t> twiddles_2D_reverse;
+
+    std::map<void*, repo_chirp_key_t> chirp_reverse;
+
+    static std::mutex mtx;
 
     // internal helpers to get and free twiddles
     template <typename KeyType>
@@ -111,6 +135,18 @@
                                         std::map<KeyType, std::pair<gpubuf, unsigned int>>&,
                                         std::map<void*, KeyType>&);
 
+    // internal helpers to get and free chirp table
+    template <typename KeyType>
+    static std::pair<void*, size_t>
+        GetChirpInternal(KeyType,
+                         std::map<KeyType, std::pair<gpubuf, unsigned int>>&,
+                         std::map<void*, KeyType>&,
+                         std::function<gpubuf(unsigned int)>);
+    template <typename KeyType>
+    static void ReleaseChirpInternal(void* ptr,
+                                     std::map<KeyType, std::pair<gpubuf, unsigned int>>&,
+                                     std::map<void*, KeyType>&);
+
 public:
     // repo is a singleton, so no copying or assignment
     Repo(const Repo&) = delete;
@@ -130,14 +166,20 @@
     static std::pair<void*, size_t> GetTwiddles1D(size_t                     length,
                                                   size_t                     length_limit,
                                                   rocfft_precision           precision,
+                                                  const char*                gpu_arch,
                                                   size_t                     largeTwdBase,
                                                   bool                       attach_halfN,
                                                   const std::vector<size_t>& radices);
+    static std::pair<void*, size_t> GetTwiddles2D(size_t           length0,
+                                                  size_t           length1,
+                                                  rocfft_precision precision,
+                                                  const char*      gpu_arch);
     static std::pair<void*, size_t>
-                GetTwiddles2D(size_t length0, size_t length1, rocfft_precision precision);
+                GetChirp(size_t length, rocfft_precision precision, const char* gpu_arch);
     static void ReleaseTwiddle1D(void* ptr);
     static void ReleaseTwiddle2D(void* ptr);
-    // remove cached twiddles
+    static void ReleaseChirp(void* ptr);
+    // remove cached twiddles/chirp
     static void Clear();
 
     // Repo is a singleton that should only be destroyed on static
diff -Nru rocfft-5.5.0/library/src/include/rocfft_hip.h rocfft-5.7.1/library/src/include/rocfft_hip.h
--- rocfft-5.5.0/library/src/include/rocfft_hip.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rocfft_hip.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,26 +0,0 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#ifndef __ROCFFT_HIP_H__
-#define __ROCFFT_HIP_H__
-
-#include <hip/hip_runtime_api.h>
-
-#endif // __ROCFFT_HIP_H__
diff -Nru rocfft-5.5.0/library/src/include/rocfft_ostream.hpp rocfft-5.7.1/library/src/include/rocfft_ostream.hpp
--- rocfft-5.5.0/library/src/include/rocfft_ostream.hpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rocfft_ostream.hpp	2023-08-09 16:19:51.000000000 +0000
@@ -21,9 +21,9 @@
 #ifndef _ROCFFT_OSTREAM_HPP_
 #define _ROCFFT_OSTREAM_HPP_
 
+#include "../../../shared/rocfft_complex.h"
 #include "rocfft.h"
 #include <cmath>
-#include <complex>
 #include <condition_variable>
 #include <cstdint>
 #include <cstdio>
diff -Nru rocfft-5.5.0/library/src/include/rtc_chirp_gen.h rocfft-5.7.1/library/src/include/rtc_chirp_gen.h
--- rocfft-5.5.0/library/src/include/rtc_chirp_gen.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_chirp_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,35 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef RTC_CHIRP_GEN
+#define RTC_CHIRP_GEN
+
+#include "rocfft.h"
+#include <hip/hip_runtime_api.h>
+#include <string>
+
+static const unsigned int CHIRP_THREADS = 32;
+
+// generate name for chirp-compute kernel
+std::string chirp_rtc_kernel_name(rocfft_precision precision);
+// generate source for chirp-compute kernel
+std::string chirp_rtc(const std::string& kernel_name, rocfft_precision precision);
+
+#endif // RTC_CHIRP_GEN
diff -Nru rocfft-5.5.0/library/src/include/rtc_chirp_kernel.h rocfft-5.7.1/library/src/include/rtc_chirp_kernel.h
--- rocfft-5.5.0/library/src/include/rtc_chirp_kernel.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_chirp_kernel.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,48 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_RTC_CHIRP_KERNEL_H
+#define ROCFFT_RTC_CHIRP_KERNEL_H
+
+#include "rtc_chirp_gen.h"
+#include "rtc_kernel.h"
+
+struct RTCKernelChirp : public RTCKernel
+{
+    // build the chirp kernel for the given GPU architecture and precision
+    static RTCKernelChirp generate(const std::string& gpu_arch, rocfft_precision precision);
+
+    // chirp kernels are launched without a DeviceCallIn (none is available
+    // at chirp generation time), so this override supplies no arguments
+    RTCKernelArgs get_launch_args(DeviceCallIn& data) override
+    {
+        return RTCKernelArgs{};
+    }
+
+protected:
+    RTCKernelChirp(const std::string&       kernel_name,
+                   const std::vector<char>& code,
+                   dim3                     gridDim,
+                   dim3                     blockDim)
+        : RTCKernel(kernel_name, code, gridDim, blockDim)
+    {
+    }
+};
+#endif // ROCFFT_RTC_CHIRP_KERNEL_H
diff -Nru rocfft-5.5.0/library/src/include/rtc_kernel.h rocfft-5.7.1/library/src/include/rtc_kernel.h
--- rocfft-5.5.0/library/src/include/rtc_kernel.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_kernel.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -30,6 +30,7 @@
 #include <string>
 #include <vector>
 
+#include "../../../shared/rocfft_complex.h"
 #include "rtc_generator.h"
 
 struct DeviceCallIn;
@@ -41,7 +42,7 @@
 {
 public:
     RTCKernelArgs() = default;
-    void append_ptr(void* ptr)
+    void append_ptr(const void* ptr)
     {
         append(&ptr, sizeof(void*));
     }
@@ -65,6 +66,15 @@
     {
         append(&f, sizeof(float));
     }
+    void append_half(_Float16 f)
+    {
+        append(&f, sizeof(f)); // no explicit alignment passed: append() aligns to the value size
+    }
+    template <typename T>
+    void append_struct(const T& data)
+    {
+        append(&data, sizeof(T), 8); // copies sizeof(T) raw bytes at a fixed 8-byte alignment
+    }
 
     size_t size_bytes() const
     {
@@ -76,15 +86,19 @@
     }
 
 private:
-    void append(void* src, size_t nbytes)
+    void append(const void* src, size_t nbytes, size_t align = 0)
     {
         // values need to be aligned to their width (i.e. 8-byte values
         // need 8-byte alignment, 4-byte needs 4-byte alignment)
+        if(align == 0)
+            align = nbytes;
+
         size_t oldsize = buf.size();
-        size_t padding = oldsize % nbytes ? nbytes - (oldsize % nbytes) : 0;
+        size_t padding = oldsize % align ? align - (oldsize % align) : 0;
         buf.resize(oldsize + padding + nbytes);
         std::copy_n(static_cast<const char*>(src), nbytes, buf.begin() + oldsize + padding);
     }
+
     std::vector<char> buf;
 };
 
@@ -97,8 +111,11 @@
     // node if successful.  returns nullptr if there is no matching
     // supported scheme + problem size.  throws runtime_error on
     // error.
-    static std::shared_future<std::unique_ptr<RTCKernel>> runtime_compile(
-        const TreeNode& node, const std::string& gpu_arch, bool enable_callbacks = false);
+    static std::shared_future<std::unique_ptr<RTCKernel>>
+        runtime_compile(const TreeNode&    node,
+                        const std::string& gpu_arch,
+                        std::string&       kernel_name,
+                        bool               enable_callbacks = false);
 
     virtual ~RTCKernel()
     {
@@ -110,6 +127,7 @@
     // disallow copies, since we expect this to be managed by smart ptr
     RTCKernel(const RTCKernel&) = delete;
     RTCKernel(RTCKernel&&)      = delete;
+
     void operator=(const RTCKernel&) = delete;
 
     // normal launch from within rocFFT execution plan
@@ -121,6 +139,9 @@
                 unsigned int   lds_bytes,
                 hipStream_t    stream = nullptr);
 
+    // query occupancy for a given block size and LDS usage, written to the occupancy reference
+    bool get_occupancy(dim3 blockDim, unsigned int lds_bytes, int& occupancy);
+
     // Subclasses implement this - each kernel type has different
     // parameters
     virtual RTCKernelArgs get_launch_args(DeviceCallIn& data) = 0;
@@ -148,10 +169,17 @@
         kernel_name_gen_t     generate_name;
         kernel_src_gen_t      generate_src;
         rtckernel_construct_t construct_rtckernel;
-        bool                  valid() const
+
+        virtual bool valid() const
         {
             return generate_name && generate_src && construct_rtckernel;
         }
+        // generator is the correct type, but kernel is already compiled
+        virtual bool is_pre_compiled() const
+        {
+            return false;
+        }
+
         // if known at compile time, the grid parameters of the kernel
         // to launch with
         dim3 gridDim;
@@ -186,7 +214,15 @@
 
 static const char* rtc_precision_name(rocfft_precision precision)
 {
-    return precision == rocfft_precision_single ? "_sp" : "_dp";
+    switch(precision)
+    {
+    case rocfft_precision_single:
+        return "_sp";
+    case rocfft_precision_half:
+        return "_half";
+    default: // double, plus any unexpected value (old "_dp" fallback): no path falls off the end
+        return "_dp";
+    }
 }
 
 static const char* rtc_precision_type_decl(rocfft_precision precision)
@@ -194,9 +230,11 @@
     switch(precision)
     {
     case rocfft_precision_single:
-        return "typedef float2 scalar_type;\n";
+        return "typedef rocfft_complex<float> scalar_type;\n";
     case rocfft_precision_double:
-        return "typedef double2 scalar_type;\n";
+        return "typedef rocfft_complex<double> scalar_type;\n";
+    case rocfft_precision_half:
+        return "typedef rocfft_complex<_Float16> scalar_type;\n";
     }
 }
 
diff -Nru rocfft-5.5.0/library/src/include/rtc_realcomplex_gen.h rocfft-5.7.1/library/src/include/rtc_realcomplex_gen.h
--- rocfft-5.5.0/library/src/include/rtc_realcomplex_gen.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_realcomplex_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -21,11 +21,12 @@
 #ifndef RTC_REAL2COMPLEX_EMBED_GEN
 #define RTC_REAL2COMPLEX_EMBED_GEN
 
-#include "../device/kernels/common.h"
 #include "compute_scheme.h"
 #include "rocfft.h"
 #include "rtc_kernel.h"
 
+#include "../device/kernels/common.h"
+
 #include <vector>
 
 struct RealComplexSpecs
diff -Nru rocfft-5.5.0/library/src/include/rtc_stockham_gen.h rocfft-5.7.1/library/src/include/rtc_stockham_gen.h
--- rocfft-5.5.0/library/src/include/rtc_stockham_gen.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_stockham_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -24,31 +24,32 @@
 #include <vector>
 
 #include "../device/generator/stockham_gen.h"
-#include "../device/kernels/common.h"
 #include "compute_scheme.h"
 #include "rocfft.h"
 #include "rtc_kernel.h"
 
+#include "../device/kernels/common.h"
+
 // generate name for RTC stockham kernel
-std::string stockham_rtc_kernel_name(ComputeScheme           scheme,
-                                     size_t                  length1D,
-                                     size_t                  length2D,
-                                     size_t                  static_dim,
-                                     int                     direction,
-                                     rocfft_precision        precision,
-                                     rocfft_result_placement placement,
-                                     rocfft_array_type       inArrayType,
-                                     rocfft_array_type       outArrayType,
-                                     bool                    unitstride,
-                                     size_t                  largeTwdBase,
-                                     size_t                  largeTwdSteps,
-                                     bool                    largeTwdBatchIsTransformCount,
-                                     EmbeddedType            ebtype,
-                                     DirectRegType           dir2regMode,
-                                     IntrinsicAccessType     intrinsicMode,
-                                     SBRC_TRANSPOSE_TYPE     transpose_type,
-                                     bool                    enable_callbacks,
-                                     bool                    enable_scaling);
+std::string stockham_rtc_kernel_name(const StockhamGeneratorSpecs& specs,
+                                     const StockhamGeneratorSpecs& specs2d,
+                                     ComputeScheme                 scheme,
+                                     int                           direction,
+                                     rocfft_precision              precision,
+                                     rocfft_result_placement       placement,
+                                     rocfft_array_type             inArrayType,
+                                     rocfft_array_type             outArrayType,
+                                     bool                          unitstride,
+                                     size_t                        largeTwdBase,
+                                     size_t                        largeTwdSteps,
+                                     bool                          largeTwdBatchIsTransformCount,
+                                     EmbeddedType                  ebtype,
+                                     DirectRegType                 dir2regMode,
+                                     IntrinsicAccessType           intrinsicMode,
+                                     SBRC_TRANSPOSE_TYPE           transpose_type,
+                                     bool                          enable_callbacks,
+                                     bool                          enable_scaling,
+                                     BluesteinFuseType             fuseBlue);
 
 // generate source for RTC stockham kernel.  transforms_per_block may
 // be nullptr, but if non-null, stockham_rtc stores the number of
@@ -72,6 +73,7 @@
                          IntrinsicAccessType           intrinsicMode,
                          SBRC_TRANSPOSE_TYPE           transpose_type,
                          bool                          enable_callbacks,
-                         bool                          enable_scaling);
+                         bool                          enable_scaling,
+                         const BluesteinFuseType&      fuseBlue);
 
 #endif
diff -Nru rocfft-5.5.0/library/src/include/rtc_stockham_kernel.h rocfft-5.7.1/library/src/include/rtc_stockham_kernel.h
--- rocfft-5.5.0/library/src/include/rtc_stockham_kernel.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_stockham_kernel.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -37,6 +37,24 @@
 
     virtual RTCKernelArgs get_launch_args(DeviceCallIn& data) override;
 
+protected:
+    struct RTCStockhamGenerator : public RTCKernel::RTCGenerator
+    {
+        // A stockham generator counts as valid as soon as generate_name is set;
+        // generate_src and construct_rtckernel are not required here.  Per the
+        // tuning-framework change, the kernel name should stay available even
+        // for kernels that are already compiled.
+        bool valid() const override
+        {
+            return static_cast<bool>(generate_name);
+        }
+        // generator is the correct type, but kernel is already compiled (only the name is set)
+        bool is_pre_compiled() const override
+        {
+            return generate_name && !(generate_src || construct_rtckernel);
+        }
+    };
+
 private:
     // true if the kernel is hardcoded for a number of dimensions.
     // kernels generated at runtime will be, but ahead-of-time
diff -Nru rocfft-5.5.0/library/src/include/rtc_transpose_gen.h rocfft-5.7.1/library/src/include/rtc_transpose_gen.h
--- rocfft-5.5.0/library/src/include/rtc_transpose_gen.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_transpose_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -21,10 +21,11 @@
 #ifndef RTC_TRANSPOSE_GEN
 #define RTC_TRANSPOSE_GEN
 
-#include "../device/kernels/common.h"
 #include "rocfft.h"
 #include "rtc_kernel.h"
 
+#include "../device/kernels/common.h"
+
 struct TransposeSpecs
 {
     unsigned int      tileX;
diff -Nru rocfft-5.5.0/library/src/include/rtc_twiddle_gen.h rocfft-5.7.1/library/src/include/rtc_twiddle_gen.h
--- rocfft-5.5.0/library/src/include/rtc_twiddle_gen.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_twiddle_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,67 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef RTC_TWIDDLE_GEN
+#define RTC_TWIDDLE_GEN
+
+#include "rocfft.h"
+#include <hip/hip_runtime_api.h>
+#include <string>
+
+static const unsigned int TWIDDLES_THREADS         = 32;
+static const unsigned int TWIDDLES_RTC_MAX_RADICES = 8; // array length of radices_t::data
+static const double       TWO_PI                   = -6.283185307179586476925286766559; // NOTE(review): this is -2*pi despite the name - presumably intentional, confirm at use sites
+
+// structure to pass fixed-length array of radices by value in
+// kernargs instead of by reference in global memory
+struct radices_t
+{
+    size_t data[TWIDDLES_RTC_MAX_RADICES];
+};
+// stringified version of same structure, for RTC
+static const char* radices_t_str{ // NOTE(review): the text below sizes data with TWIDDLES_MAX_RADICES, not TWIDDLES_RTC_MAX_RADICES - confirm the RTC source defines it with the same value
+    R"_RADICES_T(
+struct radices_t
+{
+    size_t data[TWIDDLES_MAX_RADICES];
+};
+)_RADICES_T"};
+
+enum struct TwiddleTableType
+{
+    // "stacked" table generated from radices (radices multiply out
+    // to length N)
+    RADICES,
+    // N twiddles for length N
+    LENGTH_N,
+    // half-length table for length N
+    HALF_N,
+    // "large" twiddle table, decomposed into pow2 base and multiple
+    // steps
+    LARGE,
+};
+
+// generate name for twiddle-compute kernel
+std::string twiddle_rtc_kernel_name(TwiddleTableType type, rocfft_precision precision);
+// generate source for twiddle-compute kernel
+std::string
+    twiddle_rtc(const std::string& kernel_name, TwiddleTableType type, rocfft_precision precision);
+
+#endif
diff -Nru rocfft-5.5.0/library/src/include/rtc_twiddle_kernel.h rocfft-5.7.1/library/src/include/rtc_twiddle_kernel.h
--- rocfft-5.5.0/library/src/include/rtc_twiddle_kernel.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/rtc_twiddle_kernel.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,49 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_RTC_TWIDDLE_KERNEL_H
+#define ROCFFT_RTC_TWIDDLE_KERNEL_H
+
+#include "rtc_kernel.h"
+#include "rtc_twiddle_gen.h"
+
+struct RTCKernelTwiddle : public RTCKernel
+{
+    // factory: build the twiddle kernel for a GPU architecture, table type and precision
+    static RTCKernelTwiddle
+        generate(const std::string& gpu_arch, TwiddleTableType type, rocfft_precision precision);
+
+    // no DeviceCallIn is available at twiddle generation time, so these kernels
+    // are launched without it; this override ignores data and returns empty args
+    RTCKernelArgs get_launch_args(DeviceCallIn& data) override
+    {
+        return {};
+    }
+
+protected:
+    RTCKernelTwiddle(const std::string&       kernel_name,
+                     const std::vector<char>& code,
+                     dim3                     gridDim,
+                     dim3                     blockDim)
+        : RTCKernel(kernel_name, code, gridDim, blockDim) // plain forwarding to the base
+    {
+    }
+};
+#endif
diff -Nru rocfft-5.5.0/library/src/include/solution_map.h rocfft-5.7.1/library/src/include/solution_map.h
--- rocfft-5.5.0/library/src/include/solution_map.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/solution_map.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,318 @@
+/******************************************************************************
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef SOLUTION_MAP_H
+#define SOLUTION_MAP_H
+
+#include "tree_node.h"
+#include <unordered_map>
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+#else
+#include <experimental/filesystem>
+namespace std
+{
+    namespace filesystem = experimental::filesystem;
+}
+#endif
+
+namespace fs = std::filesystem;
+
+enum SolutionNodeType
+{
+    SOL_DUMMY          = 0, // a reserved solution slot for a root-problem
+    SOL_BUILTIN_KERNEL = 1, // a solution representing a built-in kernel (transpose, r2c, c2r...)
+    SOL_KERNEL_ONLY    = 2, // a solution representing a kernel only, nothing to do with problem
+    SOL_LEAF_NODE      = 3, // a solution representing the tree-leaf-node
+    SOL_INTERNAL_NODE  = 4, // a solution with tree decomposition
+};
+
+std::string      PrintSolutionNodeType(const SolutionNodeType snt);
+SolutionNodeType StrToSolutionNodeType(const std::string& str);
+
+// arch, problem-size token
+struct ProblemKey
+{
+    std::string arch;
+    std::string probToken;
+
+    ProblemKey() = default;
+
+    // construct from an architecture name and a problem token (both copied)
+    ProblemKey(const std::string& arch, const std::string& probToken)
+        : arch(arch)
+        , probToken(probToken)
+    {
+    }
+
+    // keys are equal when both members match
+    bool operator==(const ProblemKey& rhs) const
+    {
+        return std::tie(arch, probToken) == std::tie(rhs.arch, rhs.probToken);
+    }
+
+    // lexicographic order: arch first, probToken breaks ties
+    bool operator<(const ProblemKey& rhs) const
+    {
+        return std::tie(arch, probToken) < std::tie(rhs.arch, rhs.probToken);
+    }
+};
+
+template <>
+struct ToString<ProblemKey>;
+
+template <>
+struct FromString<ProblemKey>;
+
+struct ProbKeyHash
+{
+    size_t operator()(const ProblemKey& k) const noexcept
+    {
+        // order-sensitive combine (boost::hash_combine recipe); a plain XOR of the two
+        // hashes is symmetric and yields 0 whenever arch == probToken
+        const size_t h = std::hash<std::string>{}(k.arch);
+        return h ^ (std::hash<std::string>{}(k.probToken) + 0x9e3779b9 + (h << 6) + (h >> 2));
+    }
+};
+
+struct SolutionPtr
+{
+    std::string child_token  = "";
+    size_t      child_option = 0;
+    // equal only when both child_token and child_option match
+    bool operator==(const SolutionPtr& rhs) const
+    {
+        return (child_token == rhs.child_token) && (child_option == rhs.child_option);
+    }
+};
+
+// Specializations of ToString / FromString (data_descriptor.h)
+// for writing/reading the text-format solution map
+template <>
+struct ToString<SolutionPtr>;
+
+template <>
+struct FromString<SolutionPtr>;
+
+struct SolutionNode;
+
+using SolutionNodeVec = std::vector<SolutionNode>;
+
+using ProbSolMap = std::unordered_map<ProblemKey, SolutionNodeVec, ProbKeyHash>;
+
+struct SolutionNode
+{
+    SolutionNodeType sol_node_type = SOL_INTERNAL_NODE;
+    ComputeScheme    using_scheme  = CS_NONE;
+    FMKey            kernel_key    = EmptyFMKey;
+    // like the childnodes on tree-node, a childnode could be internal/leaf/kernel-node
+    std::vector<SolutionPtr> solution_childnodes;
+
+    SolutionNode()                    = default;
+    SolutionNode(const SolutionNode&) = default;
+
+    static SolutionNode DummySolutionNode()
+    {
+        SolutionNode dummy; // fresh object per call: no shared static is written
+        dummy.sol_node_type = SOL_DUMMY;
+        return dummy;
+    }
+
+    SolutionNode& operator=(const SolutionNode&) = default;
+    // compares the solution content only; the bookkeeping members below are ignored
+    bool operator==(const SolutionNode& rhs) const
+    {
+        return std::tie(sol_node_type, using_scheme, kernel_key, solution_childnodes)
+               == std::tie(
+                   rhs.sol_node_type, rhs.using_scheme, rhs.kernel_key, rhs.solution_childnodes);
+    }
+
+    // NB:
+    //  The following are only assigned when calling "remove solution";
+    //  in the normal case nothing is assigned to them.
+    //  if a node is deleted, then its parent_sol_nodes should be deleted too
+    std::vector<SolutionNode*> parent_sol_nodes;
+    //  if a node changes its position, then its parent_sol_ptrs should update their option_id
+    std::vector<SolutionPtr*> parent_sol_ptrs;
+    //  used to find the SolutionNodeVec containing itself; null unless explicitly assigned
+    SolutionNodeVec* self_vec = nullptr;
+    // flag indicating this is to be removed; we assign marks first and remove them all at once
+    bool to_be_removed = false;
+};
+
+template <>
+struct ToString<SolutionNode>;
+
+template <>
+struct FromString<SolutionNode>;
+
+using SolMapEntry = std::pair<ProblemKey, SolutionNodeVec>;
+
+template <>
+struct ToString<SolMapEntry>;
+
+template <>
+struct FromString<SolMapEntry>;
+
+inline bool ProbSolCmp(const SolMapEntry& lhs, const SolMapEntry& rhs)
+{
+    // Ordering for solution-map entries, keyed on each entry's LAST solution node.
+    // Both vectors must be non-empty: back() is called without a check.
+    const SolutionNode& tailL = lhs.second.back();
+    const SolutionNode& tailR = rhs.second.back();
+
+    // 1st criterion: node type of the last node
+    if(tailL.sol_node_type != tailR.sol_node_type)
+        return tailL.sol_node_type < tailR.sol_node_type;
+
+    // 2nd criterion: compute scheme of the last node
+    if(tailL.using_scheme != tailR.using_scheme)
+        return tailL.using_scheme < tailR.using_scheme;
+
+    // last criterion: the keys themselves, i.e. ProblemKey::operator<
+    // (arch first, then probToken)
+    return lhs.first < rhs.first;
+}
+
+class solution_map
+{
+    friend class SolutionMapConverter;
+
+    bool assume_latest_ver = true;
+
+    int        self_version = 0;
+    ProbSolMap primary_sol_map;
+    ProbSolMap temp_working_map;
+
+    ROCFFT_EXPORT solution_map();
+
+private:
+    // a private function version of add_solution which can be called only by ctor.
+    // That is, the implementation of solution_map() generated by solution-shipping.py
+    size_t add_solution_private(const ProblemKey& probKey, const SolutionNode& solution);
+
+    // check if two solution nodes have identical semantics
+    bool SolutionNodesAreEqual(const SolutionNode& lhs,
+                               const SolutionNode& rhs,
+                               const std::string&  arch,
+                               bool                primary_map);
+
+    bool remove_solution_bottom_up(SolutionNodeVec& nodeVec, SolutionNode& node, size_t pos);
+
+    void generate_link_info();
+
+public:
+    // the latest version number of solution-map's format
+    static const int VERSION;
+
+    // a default kernel-token for any built-in kernel
+    static const char* KERNEL_TOKEN_BUILTIN_KERNEL;
+
+    // a default leafnode-token for leafnodes linking to built-in kernels
+    static const char* LEAFNODE_TOKEN_BUILTIN_KERNEL;
+
+    solution_map(const solution_map&) = delete;
+
+    solution_map& operator=(const solution_map&) = delete;
+
+    static solution_map& get_solution_map()
+    {
+        static solution_map sol_map;
+        return sol_map;
+    }
+
+    ~solution_map() = default;
+
+    void setup();
+
+    bool
+        has_solution_node(const ProblemKey& probKey, size_t option_id = 0, bool primary_map = true);
+
+    SolutionNode&
+        get_solution_node(const ProblemKey& probKey, size_t option_id = 0, bool primary_map = true);
+
+    FMKey& get_solution_kernel(const ProblemKey& probKey,
+                               size_t            option_id   = 0,
+                               bool              primary_map = true);
+
+    // setup a solution of a problem and insert to the map, should be called by a benchmarker
+    size_t add_solution(const ProblemKey&               probKey,
+                        TreeNode*                       currentNode,
+                        const std::vector<SolutionPtr>& children,
+                        bool                            isRootProb,
+                        bool                            check_dup,
+                        bool                            primary_map = true);
+
+    // add a solution of a problem to the map, should be called by a benchmarker
+    size_t add_solution(const ProblemKey& probKey,
+                        const FMKey&      kernel_key,
+                        bool              check_dup,
+                        bool              primary_map = true);
+
+    // directly insert a solution of a problem to the map, should be called by a benchmarker
+    // NB:
+    // (The following is for ComputeSchemeIsAProblem):
+    //   For the root-prob, we want the root-solution to be always at option 0, and make
+    //   it "Exclusively" used by that root-problem. So we ALWAYS put the root-solution
+    //   at the beginning of the solution vector and don't need to check_dup.
+    //
+    //   Furthermore, because option 0 is thereby reserved for the root-prob only, check_dup
+    //   starts comparing from the second element.
+    size_t add_solution(const ProblemKey&   probKey,
+                        const SolutionNode& solution,
+                        bool                isRootProb,
+                        bool                check_dup,
+                        bool                primary_map = true);
+
+    // parse the format version of the input file
+    bool get_solution_map_version(const fs::path& sol_map_in_path);
+
+    // read the map from the file at sol_map_in_path
+    bool read_solution_map_data(const fs::path& sol_map_in_path, bool primary_map = true);
+
+    // write the map to the file at sol_map_out_path,
+    // sort = output the entries in order, which is helpful when comparing before/after merging
+    bool write_solution_map_data(const fs::path& sol_map_out_path,
+                                 bool            sort        = true,
+                                 bool            primary_map = true);
+
+    // merge solutions from src_file to primary map
+    bool merge_solutions_from_file(const fs::path&                src_file,
+                                   const std::vector<ProblemKey>& root_probs);
+};
+
+class SolutionMapConverter
+{
+private:
+    // ver.0 -> ver.1: remove unused and invalid/incorrect kernels
+    //               : caused by using un-supported half_lds in sbrc/sbcr kernels
+    bool remove_invalid_half_lds();
+
+public:
+    SolutionMapConverter()  = default;
+    ~SolutionMapConverter() = default;
+
+    bool VersionCheckAndConvert(const std::string& in_map_path, const std::string& out_map_path);
+};
+
+#endif // SOLUTION_MAP_H
diff -Nru rocfft-5.5.0/library/src/include/transform.h rocfft-5.7.1/library/src/include/transform.h
--- rocfft-5.5.0/library/src/include/transform.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/transform.h	2023-08-09 16:19:51.000000000 +0000
@@ -21,7 +21,7 @@
 #ifndef TRANSFORM_H
 #define TRANSFORM_H
 
-#include "rocfft_hip.h"
+#include "../../../shared/rocfft_hip.h"
 
 struct rocfft_execution_info_t
 {
diff -Nru rocfft-5.5.0/library/src/include/tree_node.h rocfft-5.7.1/library/src/include/tree_node.h
--- rocfft-5.5.0/library/src/include/tree_node.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tree_node.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -30,23 +30,16 @@
 #include <vector>
 
 #include "../../../shared/gpubuf.h"
+#include "../../../shared/rocfft_complex.h"
 #include "../device/kernels/callback.h"
 #include "../device/kernels/common.h"
 #include "compute_scheme.h"
+#include "enum_printer.h"
+#include "function_map_key.h"
 #include "kargs.h"
 #include "rtc_kernel.h"
 #include <hip/hip_runtime_api.h>
 
-enum OperatingBuffer
-{
-    OB_UNINIT              = 0b00000,
-    OB_USER_IN             = 0b00001,
-    OB_USER_OUT            = 0b00010,
-    OB_TEMP                = 0b00100,
-    OB_TEMP_CMPLX_FOR_REAL = 0b01000,
-    OB_TEMP_BLUESTEIN      = 0b10000,
-};
-
 enum NodeType
 {
     NT_UNDEFINED, // un init
@@ -65,20 +58,6 @@
     FT_STOCKHAM_R2C_TRANSPOSE, // Stokham + post-r2c + transpose (Advance of FT_R2C_TRANSPOSE)
 };
 
-// TODO: move this to rocfft.h and allow users to select via plan description
-// the decision strategy for buffer assigment
-enum rocfft_optimize_strategy
-{
-    rocfft_optimize_min_buffer, // minimize number of buffers, possibly fewer fusions
-    rocfft_optimize_balance, // balance between buffer and fusion
-    rocfft_optimize_max_fusion, // maximize number of fusions, possibly more buffers
-};
-
-std::string PrintOperatingBuffer(const OperatingBuffer ob);
-std::string PrintOperatingBufferCode(const OperatingBuffer ob);
-std::string PrintSBRCTransposeType(const SBRC_TRANSPOSE_TYPE ty);
-std::string PrintDirectToFromRegMode(const DirectRegType ty);
-
 typedef void (*DevFnCall)(const void*, void*);
 
 struct GridParam
@@ -100,6 +79,47 @@
     }
 };
 
+// fetch the hipDeviceProp_t of the device reported by hipGetDevice; throws if either call fails
+static hipDeviceProp_t get_curr_device_prop()
+{
+    int currentDevice = 0;
+    if(hipGetDevice(&currentDevice) != hipSuccess)
+        throw std::runtime_error("hipGetDevice failed.");
+
+    hipDeviceProp_t deviceProp;
+    const auto      status = hipGetDeviceProperties(&deviceProp, currentDevice);
+    if(status != hipSuccess)
+        throw std::runtime_error("hipGetDeviceProperties failed for deviceId "
+                                 + std::to_string(currentDevice));
+    return deviceProp;
+}
+
+// map the device's gcnArchName onto an arch name used as part of the solution-map key
+static std::string get_arch_name(const hipDeviceProp_t& prop)
+{
+    // known archs, tested in this order; the first one found inside gcnArchName wins
+    static const char* const known_archs[] = {"gfx803",
+                                              "gfx900",
+                                              "gfx906",
+                                              "gfx908",
+                                              "gfx90a",
+                                              "gfx1030",
+                                              "gfx1100",
+                                              "gfx1101",
+                                              "gfx1102"};
+
+    const std::string reported(prop.gcnArchName);
+
+    for(const char* candidate : known_archs)
+    {
+        if(reported.find(candidate) != std::string::npos)
+            return candidate;
+    }
+
+    // nothing matched: fall back to the generic "any" key
+    return "any";
+}
+
 static bool is_device_gcn_arch(const hipDeviceProp_t& prop, const std::string& cmpTarget)
 {
     std::string archName(prop.gcnArchName);
@@ -117,18 +137,36 @@
     return length.size() == 3 && length[0] == length[1] && length[1] == length[2];
 }
 
-inline size_t sizeof_precision(rocfft_precision precision)
+// Check whether a length is present in a map of precision-length exceptions.
+// Half-precision is assumed to have the same exceptions as single-precision.
+// A precision that is absent from the map makes std::map::at throw std::out_of_range.
+static bool length_excepted(const std::map<rocfft_precision, std::set<size_t>>& exceptions,
+                            rocfft_precision                                    precision,
+                            size_t                                              length)
+{
+    if(precision == rocfft_precision_half)
+        precision = rocfft_precision_single;
+    return exceptions.at(precision).count(length);
+}
+
+void get_large_twd_base_steps(size_t large1DLen, bool use3steps, size_t& base, size_t& steps);
+
+struct SchemeTree
 {
-    switch(precision)
+    ComputeScheme                            curScheme;
+    size_t                                   numKernels = 0;
+    std::vector<std::unique_ptr<SchemeTree>> children;
+
+    SchemeTree() {}
+    SchemeTree(ComputeScheme s)
+        : curScheme(s)
     {
-    case rocfft_precision_single:
-        return 2 * sizeof(float);
-    case rocfft_precision_double:
-        return 2 * sizeof(double);
     }
-    assert(false);
-    return 0;
-}
+};
+
+using SchemeVec = std::vector<ComputeScheme>;
+
+static SchemeVec EmptySchemeVec = {};
 
 class TreeNode;
 
@@ -140,7 +178,9 @@
     std::vector<size_t>     length;
     std::vector<size_t>     outputLength;
     std::vector<size_t>     inStride, outStride;
+    std::vector<size_t>     inStrideBlue, outStrideBlue;
     size_t                  iDist = 0, oDist = 0;
+    size_t                  iDistBlue = 0, oDistBlue = 0;
     size_t                  iOffset = 0, oOffset = 0;
     int                     direction    = -1;
     rocfft_result_placement placement    = rocfft_placement_inplace;
@@ -266,9 +306,15 @@
     // Stride of the FFT in each dimension
     std::vector<size_t> inStride, outStride;
 
+    // Stride of the fused Bluestein FFT in each dimension
+    std::vector<size_t> inStrideBlue, outStrideBlue;
+
     // Distance between consecutive batch members:
     size_t iDist = 0, oDist = 0;
 
+    // Distance between consecutive batch members in fused Bluestein nodes
+    size_t iDistBlue = 0, oDistBlue = 0;
+
     // Offsets to start of data in buffer:
     size_t iOffset = 0, oOffset = 0;
 
@@ -294,7 +340,7 @@
     bool largeTwd3Steps = false;
     // "Steps": how many exact loops we need to decompose the LTWD?
     // if we pass this as a template arg in kernel, should avoid dynamic while-loop
-    // We will update this in set_large_twd_base_steps()
+    // We will update this in get_large_twd_base_steps()
     size_t ltwdSteps = 0;
     // true if large twd multiply uses batch as transform count - this
     // is done on strided large 1D FFTs where the batch dimension moves
@@ -308,7 +354,10 @@
     DirectRegType dir2regMode = DirectRegType::FORCE_OFF_OR_NOT_SUPPORT;
 
     // sbrc transpose type
-    SBRC_TRANSPOSE_TYPE sbrcTranstype = SBRC_TRANSPOSE_TYPE::NONE;
+    mutable SBRC_TRANSPOSE_TYPE sbrcTranstype = SBRC_TRANSPOSE_TYPE::NONE;
+
+    // kernel key specified by the solution map (if any)
+    std::unique_ptr<FMKey> specified_key;
 
     // Tree structure:
     // non-owning pointer to parent node, may be null
@@ -326,7 +375,13 @@
     // Length of the FFT for computing zero-padded linear convolutions
     // in Bluestein's algorithm. If Bluestein is required to compute an
     // FFT of length N, then lengthBlue >= 2N - 1.
-    size_t lengthBlue = 0;
+    size_t lengthBlue  = 0;
+    size_t lengthBlueN = 0;
+
+    // Bluestein settings for this node: type, fuse type, and the need_chirp flag
+    BluesteinType     typeBlue   = BluesteinType::BT_NONE;
+    BluesteinFuseType fuseBlue   = BluesteinFuseType::BFT_NONE;
+    bool              need_chirp = false;
 
     // Device pointers:
     // twiddle memory is owned by the repo
@@ -334,6 +389,8 @@
     size_t           twiddles_size       = 0;
     void*            twiddles_large      = nullptr;
     size_t           twiddles_large_size = 0;
+    void*            chirp               = nullptr;
+    size_t           chirp_size          = 0;
     gpubuf_t<size_t> devKernArg;
 
     // callback parameters
@@ -404,8 +461,10 @@
         return false;
     }
 
-    virtual void RecursiveBuildTree(); // Main tree builder: override by child
-    virtual void SanityCheck();
+    void RecursiveBuildTree(SchemeTree* solution_scheme = nullptr);
+
+    virtual void SanityCheck(SchemeTree*         solution_scheme = nullptr,
+                             std::vector<FMKey>& kernel_keys     = EmptyFMKeyVec);
     // If high dims are contiguous, we can collapse them to make offset
     // calculation simpler
     void CollapseContiguousDims();
@@ -440,11 +499,14 @@
                                size_t& chirpSize);
 
     // Output plan information for debug purposes:
-    void Print(rocfft_ostream& os, int indent = 0) const;
+    virtual void Print(rocfft_ostream& os, int indent = 0) const;
 
     // logic B - using in-place transposes, todo
     //void RecursiveBuildTreeLogicB();
 
+    void RecursiveFindChildNodes(const ComputeScheme& scheme, std::vector<TreeNode*>& nodes);
+    void RecursiveCopyNodeData(const TreeNode& srcNode);
+
     void RecursiveRemoveNode(TreeNode* node);
 
     // insert a newNode before the node "pos"
@@ -474,10 +536,10 @@
         return false;
     }
 
-    virtual bool KernelCheck()                                             = 0;
-    virtual bool CreateDevKernelArgs()                                     = 0;
-    virtual bool CreateTwiddleTableResource()                              = 0;
-    virtual void SetupGridParamAndFuncPtr(DevFnCall& fnPtr, GridParam& gp) = 0;
+    virtual bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) = 0;
+    virtual bool CreateDevKernelArgs()                                        = 0;
+    virtual bool CreateDeviceResources()                                      = 0;
+    virtual void SetupGridParamAndFuncPtr(DevFnCall& fnPtr, GridParam& gp)    = 0;
 
     // for 3D SBRC kernels, decide the transpose type based on the
     // block width and lengths that the block tiles need to align on.
@@ -487,6 +549,16 @@
         return NONE;
     }
 
+    // default key for leaf nodes (non-sbrc, no sbrc_trans): specified_key if set, else an fpkey from length, precision, scheme
+    virtual FMKey GetKernelKey() const
+    {
+        if(specified_key)
+            return *specified_key.get();
+
+        return (dimension == 1) ? fpkey(length[0], precision, scheme)
+                                : fpkey(length[0], length[1], precision, scheme);
+    }
+
     // Compute the large twd decomposition base
     void set_large_twd_base_steps(size_t largeTWDLength);
 
@@ -496,8 +568,8 @@
     bool IsBluesteinChirpSetup();
 
 protected:
-    virtual void BuildTree_internal()    = 0;
-    virtual void AssignParams_internal() = 0;
+    virtual void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) = 0;
+    virtual void AssignParams_internal()                                             = 0;
 };
 
 class InternalNode : public TreeNode
@@ -517,9 +589,9 @@
         return false;
     }
 
-    bool CreateTwiddleTableResource() override
+    bool CreateDeviceResources() override
     {
-        throw std::runtime_error("Shouldn't call CreateTwiddleTableResource in a non-LeafNode");
+        throw std::runtime_error("Shouldn't call CreateDeviceResources in a non-LeafNode");
         return false;
     }
 
@@ -529,7 +601,7 @@
     }
 
 public:
-    bool KernelCheck() override
+    bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override
     {
         return true;
     }
@@ -557,9 +629,11 @@
     size_t              wgs              = 0;
     size_t              lds              = 0;
 
-    void           BuildTree_internal() final {} // nothing to do in leaf node
-    void           AssignParams_internal() final {} // nothing to do in leaf node
-    bool           CreateLargeTwdTable();
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) final {
+    } // nothing to do in leaf node
+    void AssignParams_internal() final {} // nothing to do in leaf node
+    bool CreateLargeTwdTable();
+
     virtual size_t GetTwiddleTableLength();
     // Limit length of generated twiddle table.  Default limit is 0,
     // which means to generate the full length of table.
@@ -569,11 +643,16 @@
     }
     virtual void SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp) = 0;
 
-    bool         KernelCheck() override;
-    void         SanityCheck() override;
+public:
+    // leaf nodes print additional information about the kernel settings
+    void         Print(rocfft_ostream& os, int indent = 0) const override;
+    bool         KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override;
+    void         SanityCheck(SchemeTree*         solution_scheme = nullptr,
+                             std::vector<FMKey>& kernel_keys     = EmptyFMKeyVec) override;
     virtual bool CreateDevKernelArgs() override;
-    bool         CreateTwiddleTableResource() override;
+    bool         CreateDeviceResources() override;
     void         SetupGridParamAndFuncPtr(DevFnCall& fnPtr, GridParam& gp) override;
+    FMKey        GetKernelKey() const override;
     virtual void GetKernelFactors();
 };
 
@@ -620,6 +699,12 @@
     // are the nodes that do actual work
     std::vector<TreeNode*> execSeq;
 
+    // kernels extracted from the solution map
+    std::vector<FMKey> solution_kernels;
+
+    // scheme decompositions from solution map
+    std::unique_ptr<SchemeTree> rootScheme;
+
     // flattened potentially-fusable shims of rootPlan
     std::vector<FuseShim*> fuseShims;
 
@@ -631,6 +716,12 @@
     std::vector<size_t> iLength;
     std::vector<size_t> oLength;
 
+    // Indicates whether this is a standalone chirp plan
+    // in multi-kernel Bluestein implementations (buffers
+    // in the standalone plan are not connected with the
+    // rest of the nodes in the fft plan).
+    bool IsChirpPlan = false;
+
     // default: starting from ABT, balance buffers and fusions
     // we could allow users to set in the later PR
     rocfft_optimize_strategy assignOptStrategy = rocfft_optimize_balance;
@@ -654,6 +745,10 @@
     std::pair<TreeNode*, TreeNode*> get_load_store_nodes() const;
 };
 
+std::unique_ptr<SchemeTree> ApplySolution(ExecPlan& execPlan);
+
+// get a min_token (without batch, stride, offset...) of a node, for generating a prob-key
+void GetNodeToken(const TreeNode& probNode, std::string& min_token, std::string& full_token);
 void ProcessNode(ExecPlan& execPlan);
 void PrintNode(rocfft_ostream& os, const ExecPlan& execPlan);
 
diff -Nru rocfft-5.5.0/library/src/include/tree_node_1D.h rocfft-5.7.1/library/src/include/tree_node_1D.h
--- rocfft-5.5.0/library/src/include/tree_node_1D.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tree_node_1D.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -37,7 +37,7 @@
         scheme = CS_L1D_TRTRT;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -54,7 +54,7 @@
         scheme = CS_L1D_CC;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -71,7 +71,7 @@
         scheme = CS_L1D_CRT;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -92,7 +92,7 @@
     void SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp) override;
 
 public:
-    bool                CreateTwiddleTableResource() override;
+    bool                CreateDeviceResources() override;
     std::vector<size_t> CollapsibleDims() override;
     bool                UseOutputLengthForPadding() override
     {
@@ -119,13 +119,19 @@
 
     void SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp) override;
 
-    void SetDirectRegType();
+    // InitIntrinsicMode is the first step to check if eligible for buffer load/store
+    void InitIntrinsicMode();
 
-    void SetIntrinsicMode();
+    // Manually disable the functionality based on benchmark results.
+    //     The settings come from observation; in effect they are hard-coded tuning parameters.
+    // NB: While tuning, we should not apply these manual settings.
+    //     We should just use the exact settings from the specified config.
+    void TuneDirectRegType();
+    void TuneIntrinsicMode();
 
 public:
     // we can put codes here to switch-on/off some features at arch-wise
-    bool KernelCheck() override;
+    bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override;
 
     // reads + writes are along columns so both may benefit from padding
     bool PaddingBenefitsInput() override
@@ -157,13 +163,16 @@
 
     void SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp) override;
 
-    void SetDirectRegType();
+    void TuneDirectRegType();
 
 public:
     SBRC_TRANSPOSE_TYPE sbrc_transpose_type(unsigned int blockWidth) const override;
 
     // we can put codes here to switch-on/off some features at arch-wise
-    bool KernelCheck() override;
+    bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override;
+
+    // override for sbrcTransType
+    FMKey GetKernelKey() const override;
 
     // writes are along columns so they may benefit from padding
     bool PaddingBenefitsOutput() override
@@ -194,15 +203,17 @@
 
     void SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp) override;
 
-    void SetDirectRegType();
+    // InitIntrinsicMode is the first step to check if eligible for buffer load/store
+    void InitIntrinsicMode();
 
-    void SetIntrinsicMode();
+    void TuneDirectRegType();
+    void TuneIntrinsicMode();
 
 public:
     // we can put codes here to switch-on/off some features at arch-wise
-    bool KernelCheck() override;
+    bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override;
 
-    bool CreateTwiddleTableResource() override;
+    bool CreateDeviceResources() override;
 
     // reads are along columns so they may benefit from padding
     bool PaddingBenefitsInput() override
diff -Nru rocfft-5.5.0/library/src/include/tree_node_2D.h rocfft-5.7.1/library/src/include/tree_node_2D.h
--- rocfft-5.5.0/library/src/include/tree_node_2D.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tree_node_2D.h	2023-08-09 16:19:51.000000000 +0000
@@ -38,7 +38,7 @@
         scheme = CS_2D_RTRT;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -55,7 +55,7 @@
         scheme = CS_2D_RC;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -76,7 +76,7 @@
     void SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp) override;
 
 public:
-    bool CreateTwiddleTableResource() override;
+    bool CreateDeviceResources() override;
 };
 
 #endif // TREE_NODE_2D_H
diff -Nru rocfft-5.5.0/library/src/include/tree_node_3D.h rocfft-5.7.1/library/src/include/tree_node_3D.h
--- rocfft-5.5.0/library/src/include/tree_node_3D.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tree_node_3D.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,7 @@
 
 protected:
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -63,7 +63,7 @@
         scheme = CS_3D_TRTRTR;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -86,7 +86,7 @@
         scheme = CS_3D_BLOCK_RC;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -107,7 +107,7 @@
         scheme = CS_3D_BLOCK_CR;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -130,7 +130,7 @@
     }
 
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 };
 
 /*****************************************************
@@ -155,10 +155,10 @@
         return 0;
     }
 
-    void SetDirectRegType();
+    void TuneDirectRegType();
 
 public:
-    bool KernelCheck() override;
+    bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override;
 
     SBRC_TRANSPOSE_TYPE sbrc_transpose_type(unsigned int blockWidth) const override
     {
@@ -175,6 +175,9 @@
             return TILE_ALIGNED;
         return TILE_UNALIGNED;
     }
+
+    // override for sbrcTransType
+    FMKey GetKernelKey() const override;
 };
 
 /*****************************************************
diff -Nru rocfft-5.5.0/library/src/include/tree_node_bluestein.h rocfft-5.7.1/library/src/include/tree_node_bluestein.h
--- rocfft-5.5.0/library/src/include/tree_node_bluestein.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tree_node_bluestein.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -36,8 +36,12 @@
     {
         scheme = CS_BLUESTEIN;
     }
-    void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void          AssignParams_internal() override;
+    void          BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
+    BluesteinType DecideBlueType();
+
+public:
+    static size_t FindBlue(size_t len, rocfft_precision precision, bool forcePow2);
 };
 
 /*****************************************************
@@ -58,7 +62,7 @@
     // check if the specified 1D length fits into single-kernel Bluestein
     static bool SizeFits(size_t length, rocfft_precision precision);
 
-    bool KernelCheck() override
+    bool KernelCheck(std::vector<FMKey>& kernel_keys = EmptyFMKeyVec) override
     {
         GetKernelFactors();
         return true;
diff -Nru rocfft-5.5.0/library/src/include/tree_node_real.h rocfft-5.7.1/library/src/include/tree_node_real.h
--- rocfft-5.5.0/library/src/include/tree_node_real.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tree_node_real.h	2023-08-09 16:19:51.000000000 +0000
@@ -37,7 +37,7 @@
         scheme = CS_REAL_TRANSFORM_USING_CMPLX;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 
 public:
     bool UseOutputLengthForPadding() override
@@ -60,7 +60,7 @@
         scheme = CS_REAL_TRANSFORM_EVEN;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 
 public:
     // 3D Even can possibly set this
@@ -97,7 +97,7 @@
         scheme = CS_REAL_2D_EVEN;
     }
     void     AssignParams_internal() override;
-    void     BuildTree_internal() override;
+    void     BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
     Solution solution = TR_PAIR;
 
     void BuildTree_internal_SBCC();
@@ -134,7 +134,7 @@
         scheme = CS_REAL_3D_EVEN;
     }
     void AssignParams_internal() override;
-    void BuildTree_internal() override;
+    void BuildTree_internal(const SchemeVec& child_schemes = EmptySchemeVec) override;
 
     Solution solution = TR_PAIRS;
 
diff -Nru rocfft-5.5.0/library/src/include/tuning_helper.h rocfft-5.7.1/library/src/include/tuning_helper.h
--- rocfft-5.5.0/library/src/include/tuning_helper.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tuning_helper.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,177 @@
+
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef TUNING_HELPER_H
+#define TUNING_HELPER_H
+
+#include <iostream>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "solution_map.h"
+
+struct BenchmarkInfo
+{
+    std::string prob_token;
+    std::string kernel_name;
+    std::string factors_str; // factors string as "[a, b, c]", written as a field in the CSV
+    std::string util_rate; // utilization is defined as "how many butterflies each thread does"
+    int         tuning_phase; // phase-0/1, see comments of PropagateBestFactorsToNextPhase()
+    int         SSN; // the serial number of the kernel-candidate
+    int         num_blocks; // the following fields describe the current kernel execution
+    int         workgroup_size;
+    int         threads_per_trans;
+    int         trans_per_block;
+    int         LDS_bytes;
+    int         occupancy;
+    int         numCUs;
+    double      milli_seconds;
+    double      gflops;
+    double      granularity;
+    double      bw_eff;
+};
+
+struct rocfft_tuning_packet
+{
+    // tuning results; each vector's size is num_nodes
+    std::string              tuning_arch_name;
+    std::string              tuning_problem_name;
+    std::string              output_solution_map_path;
+    std::vector<std::string> tuning_kernel_tokens;
+    std::vector<std::string> kernel_names;
+    std::vector<std::string> factors_str;
+    std::vector<std::string> util_rate;
+    std::vector<bool>        is_builtin_kernel;
+    std::vector<double>      bw_effs;
+    std::vector<size_t>      num_of_blocks;
+    std::vector<size_t>      wgs;
+    std::vector<size_t>      tpt;
+    std::vector<size_t>      tpb;
+    std::vector<size_t>      lds_bytes;
+    std::vector<int>         occupancy; // we allow -1, indicating the RTC kernel failed to compile
+    int                      numCUs;
+
+    // settings
+    bool dump_candidates   = false;
+    bool export_full_token = false;
+    // reserved: indicates whether we dump a full token,
+    // making the solution used exclusively by that exact problem
+
+    // tuning status
+    bool             init_step      = false;
+    bool             is_tuning      = false;
+    int              total_nodes    = 0;
+    int              tuning_node_id = -1;
+    int              current_ssn    = -1;
+    int              tuning_phase   = 0; // 0: no factor permutation; 1: permute
+    std::vector<int> total_candidates;
+
+    // size is #-nodes; each elem indicates in which phase, which id, what name is the winner
+    std::vector<int>         winner_phases;
+    std::vector<int>         winner_ids;
+    std::vector<std::string> winner_kernel_names;
+
+    // size is #-nodes; each elem is the target_factors of this node
+    std::vector<std::set<std::string>> target_factors;
+
+    rocfft_tuning_packet() = default;
+};
+
+class TuningBenchmarker
+{
+private:
+    solution_map*                         binding_solution_map = nullptr;
+    std::unique_ptr<rocfft_tuning_packet> packet               = nullptr;
+    // outer vector size is #-nodes; each elem is a vector with size = #-kernel-candidates
+    std::vector<std::vector<BenchmarkInfo>> benchmark_infos_of_node;
+
+    void ResetKernelInfo();
+
+    TuningBenchmarker() = default;
+
+public:
+    TuningBenchmarker(const TuningBenchmarker&) = delete;
+
+    TuningBenchmarker& operator=(const TuningBenchmarker&) = delete;
+
+    static TuningBenchmarker& GetSingleton()
+    {
+        static TuningBenchmarker singleton;
+        return singleton;
+    }
+
+    ~TuningBenchmarker();
+
+    // create packet
+    void Setup();
+
+    // release packet
+    void Clean();
+
+    rocfft_tuning_packet* GetPacket();
+
+    void SetBindingSolutionMap(solution_map* sol_map);
+
+    solution_map* GetBindingSolutionMap();
+
+    // the status query
+    bool IsInitializingTuning();
+    bool IsProcessingTuning();
+
+    bool SetInitStep(int tuning_phase);
+
+    int UpdateNumOfTuningNodes();
+
+    int GetNumOfKernelCandidates(size_t node_id);
+
+    bool SetCurrentTuningNodeId(size_t node_id);
+
+    bool SetCurrentKernelCandidateId(size_t kernel_config_id);
+
+    BenchmarkInfo GetCurrBenchmarkInfo();
+
+    void UpdateCurrBenchResult(double ms, double gflops);
+
+    void FindWinnerForCurrNode(double&      curr_best_msec,
+                               int&         winner_phase,
+                               int&         winner_config_id,
+                               std::string& winner_kernel_name);
+
+    void ExportWinnerToSolutions();
+
+    // We do a 2-phase tuning:
+    //  phase 0: tune without permuting the factors, and get the best 3 factor sets
+    //  phase 1: propagate the best 3 factor sets to phase 1, and permute them.
+    void PropagateBestFactorsToNextPhase();
+
+    void GetOutputSolutionMapPath(std::string& out_path);
+
+    bool ExportCSV(bool append_data = false);
+
+    bool MergingSolutionsMaps(const std::string& base_map_path,
+                              const std::string& new_map_path,
+                              const std::string& probKeyStr,
+                              const std::string& out_map_path);
+};
+
+#endif // TUNING_HELPER_H
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/include/tuning_kernel_tuner.h rocfft-5.7.1/library/src/include/tuning_kernel_tuner.h
--- rocfft-5.5.0/library/src/include/tuning_kernel_tuner.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tuning_kernel_tuner.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,29 @@
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef KERNEL_TUNER_H
+#define KERNEL_TUNER_H
+
+#include "tree_node.h"
+#include "tuning_helper.h"
+
+void EnumerateKernelConfigs(const ExecPlan& execPlan);
+
+#endif
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/include/tuning_plan_tuner.h rocfft-5.7.1/library/src/include/tuning_plan_tuner.h
--- rocfft-5.5.0/library/src/include/tuning_plan_tuner.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/include/tuning_plan_tuner.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,31 @@
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef PLAN_TUNER_H
+#define PLAN_TUNER_H
+
+#include "tree_node.h"
+#include "tuning_helper.h"
+
+// return size_t: the "option_id" of the return node in its sol-vector
+size_t SerializeTree(TreeNode* node, std::string& archName);
+void   EnumerateTrees(ExecPlan& execPlan);
+
+#endif
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/include/twiddles.h rocfft-5.7.1/library/src/include/twiddles.h
--- rocfft-5.5.0/library/src/include/twiddles.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/include/twiddles.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -25,17 +25,20 @@
 #include "rocfft.h"
 #include <vector>
 
-static const size_t LTWD_BASE_DEFAULT       = 8;
-static const size_t LARGE_TWIDDLE_THRESHOLD = 4096;
+static const size_t       LTWD_BASE_DEFAULT       = 8;
+static const size_t       LARGE_TWIDDLE_THRESHOLD = 4096;
+static const unsigned int TWIDDLES_MAX_RADICES    = 8;
 
 gpubuf twiddles_create(size_t                     N,
                        size_t                     length_limit,
                        rocfft_precision           precision,
+                       const char*                gpu_arch,
                        size_t                     largeTwdBase,
                        bool                       attach_halfN,
                        const std::vector<size_t>& radices,
                        unsigned int               deviceId);
-gpubuf twiddles_create_2D(size_t N1, size_t N2, rocfft_precision precision, unsigned int deviceId);
+gpubuf twiddles_create_2D(
+    size_t N1, size_t N2, rocfft_precision precision, const char* gpu_arch, unsigned int deviceId);
 
 void twiddle_streams_cleanup();
 
diff -Nru rocfft-5.5.0/library/src/kargs.cpp rocfft-5.7.1/library/src/kargs.cpp
--- rocfft-5.5.0/library/src/kargs.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/kargs.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -21,7 +21,7 @@
 *******************************************************************************/
 
 #include "kargs.h"
-#include "rocfft_hip.h"
+#include "../../shared/rocfft_hip.h"
 #include <cassert>
 
 // malloc device buffer; copy host buffer to device buffer
diff -Nru rocfft-5.5.0/library/src/node_factory.cpp rocfft-5.7.1/library/src/node_factory.cpp
--- rocfft-5.5.0/library/src/node_factory.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/node_factory.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -20,6 +20,7 @@
 
 #include "node_factory.h"
 #include "../../shared/arithmetic.h"
+#include "../../shared/precision_type.h"
 #include "function_pool.h"
 #include "fuse_shim.h"
 #include "hip/hip_runtime_api.h"
@@ -301,6 +302,10 @@
 // Checks whether the non-pow2 length input is supported for a Bluestein compute scheme
 bool NodeFactory::NonPow2LengthSupported(rocfft_precision precision, size_t length)
 {
+    // assume half precision behaves the same as single
+    if(precision == rocfft_precision_half)
+        precision = rocfft_precision_single;
+
     // Exceptions which have been found to perform poorly when compared to the next pow2 length
     static const std::map<rocfft_precision, std::set<size_t>> length_exceptions
         = {{rocfft_precision_single,
@@ -309,7 +314,7 @@
             {104,  108,  180,  224,  225,  432,  450,  810,  2401,  2430,  2700,  2880,   3125,
              3200, 3240, 3375, 3456, 3600, 3645, 4913, 6561, 11200, 53248, 57344, 106496, 114688}}};
 
-    if(length_exceptions.at(precision).count(length))
+    if(length_excepted(length_exceptions, precision, length))
         return false;
 
     // Look for regular Stockham kernels support
@@ -329,7 +334,12 @@
     return false;
 }
 
-inline bool SupportedLength(rocfft_precision precision, size_t len)
+size_t NodeFactory::GetBluesteinLength(rocfft_precision precision, size_t len)
+{
+    return BluesteinNode::FindBlue(len, precision, BluesteinSingleNode::SizeFits(len, precision));
+}
+
+bool NodeFactory::SupportedLength(rocfft_precision precision, size_t len)
 {
     // do we have an explicit kernel?
     if(function_pool::has_function(fpkey(len, precision)))
@@ -467,13 +477,41 @@
     }
 }
 
-std::unique_ptr<TreeNode> NodeFactory::CreateExplicitNode(NodeMetaData& nodeData, TreeNode* parent)
+std::unique_ptr<TreeNode> NodeFactory::CreateExplicitNode(NodeMetaData& nodeData,
+                                                          TreeNode*     parent,
+                                                          ComputeScheme determined_scheme)
 {
-    // TreeNode*     p = dummyNode->parent;
-    ComputeScheme s = DecideNodeScheme(nodeData, parent);
-    if(s == CS_NONE)
+    // Case: the tree is being created from the solution map, the scheme is L1D,
+    // and this is not the root node.
+    // NB (why this is needed):
+    //   Ideally, the decide-scheme functions would not assign/change any length
+    //   data; it should be set beforehand. That is not the case for L1D: when
+    //   deciding an L1D scheme, an extra temporary length is appended to record
+    //   how to "factorize" the large 1D, and it is popped later in "build-tree".
+    //   When creating the tree from the solution map we already have the
+    //   determined_scheme; however, if we skip decide-node-scheme we lose that
+    //   factor length, and the later "pop" causes an error. So we must still
+    //   call decide-node-scheme here whenever we know the scheme is L1D. (Not
+    //   for the root node, though: it calls the decide function anyway, before
+    //   we try looking up solutions.)
+    if((determined_scheme == CS_L1D_TRTRT || determined_scheme == CS_L1D_CC
+        || determined_scheme == CS_L1D_CRT)
+       && (parent != nullptr))
+    {
+        auto s = DecideNodeScheme(nodeData, parent);
+        if(determined_scheme != s)
+            throw std::runtime_error("solution map error for L1D sub-problem");
+    }
+
+    // creating tree without solution map, must call DecideNodeScheme
+    if(determined_scheme == CS_NONE)
+        determined_scheme = DecideNodeScheme(nodeData, parent);
+
+    // check that a scheme was successfully determined
+    if(determined_scheme == CS_NONE)
         throw std::runtime_error("DecideNodeScheme Failed!: CS_NONE");
-    auto node = CreateNodeFromScheme(s, parent);
+
+    auto node = CreateNodeFromScheme(determined_scheme, parent);
     node->CopyNodeData(nodeData);
     return node;
 }
@@ -571,7 +609,8 @@
         if(nodeData.length[0] <= block_threshold)
         {
             // Enable block compute under these conditions
-            if(nodeData.precision == rocfft_precision_single)
+            if(nodeData.precision == rocfft_precision_single
+               || nodeData.precision == rocfft_precision_half)
             {
                 if(map1DLengthSingle.find(nodeData.length[0]) != map1DLengthSingle.end())
                 {
@@ -635,7 +674,8 @@
     }
     else // if not Pow2
     {
-        if(nodeData.precision == rocfft_precision_single)
+        if(nodeData.precision == rocfft_precision_single
+           || nodeData.precision == rocfft_precision_half)
         {
             if(map1DLengthSingle.find(nodeData.length[0]) != map1DLengthSingle.end())
             {
@@ -779,7 +819,8 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {84, 112, 168}},
                {rocfft_precision_double, {84, 108, 112, 168}}};
-        if(childScheme == CS_2D_RC && exceptions.at(nodeData.precision).count(nodeData.length[1])
+        if(childScheme == CS_2D_RC
+           && length_excepted(exceptions, nodeData.precision, nodeData.length[1])
            && nodeData.rootIsC2C)
         {
             return CS_3D_TRTRTR;
@@ -832,7 +873,7 @@
         fpkey(nodeData.length[0], nodeData.length[1], nodeData.precision, CS_KERNEL_2D_SINGLE));
 
     int ldsUsage = nodeData.length[0] * nodeData.length[1] * kernel.transforms_per_block
-                   * sizeof_precision(nodeData.precision);
+                   * complex_type_size(nodeData.precision);
     if(1.5 * ldsUsage > ldsSize)
         return false;
 
@@ -859,8 +900,8 @@
         if(function_pool::has_SBRC_kernel(nodeData.length[i], nodeData.precision))
         {
             // make sure the SBRC kernel on that dimension would be tile-aligned
-            auto kernel = function_pool::get_kernel(
-                fpkey(nodeData.length[i], nodeData.precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
+            auto kernel = function_pool::get_kernel(fpkey(
+                nodeData.length[i], nodeData.precision, CS_KERNEL_STOCKHAM_BLOCK_RC, TILE_ALIGNED));
             if(nodeData.length[(i + 2) % nodeData.length.size()] % kernel.transforms_per_block == 0)
                 ++sbrc_dimensions;
         }
diff -Nru rocfft-5.5.0/library/src/plan.cpp rocfft-5.7.1/library/src/plan.cpp
--- rocfft-5.5.0/library/src/plan.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/plan.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -22,8 +22,10 @@
 #include "../../shared/arithmetic.h"
 #include "../../shared/array_predicate.h"
 #include "../../shared/environment.h"
+#include "../../shared/precision_type.h"
 #include "../../shared/ptrdiff.h"
 #include "assignment_policy.h"
+#include "enum_printer.h"
 #include "function_pool.h"
 #include "hip/hip_runtime_api.h"
 #include "logging.h"
@@ -32,6 +34,9 @@
 #include "rocfft.h"
 #include "rocfft_ostream.hpp"
 #include "rtc_kernel.h"
+#include "solution_map.h"
+#include "tuning_helper.h"
+#include "tuning_plan_tuner.h"
 
 #include <algorithm>
 #include <assert.h>
@@ -45,7 +50,6 @@
 
 #define TO_STR2(x) #x
 #define TO_STR(x) TO_STR2(x)
-#define ENUMSTR(x) x, TO_STR(x)
 
 // clang-format off
 #define ROCFFT_VERSION_STRING (TO_STR(rocfft_version_major) "." \
@@ -54,52 +58,6 @@
                                TO_STR(rocfft_version_tweak) )
 // clang-format on
 
-std::string PrintOperatingBuffer(const OperatingBuffer ob)
-{
-    const std::map<OperatingBuffer, const char*> BuffertoString
-        = {{ENUMSTR(OB_UNINIT)},
-           {ENUMSTR(OB_USER_IN)},
-           {ENUMSTR(OB_USER_OUT)},
-           {ENUMSTR(OB_TEMP)},
-           {ENUMSTR(OB_TEMP_CMPLX_FOR_REAL)},
-           {ENUMSTR(OB_TEMP_BLUESTEIN)}};
-    return BuffertoString.at(ob);
-}
-
-std::string PrintOperatingBufferCode(const OperatingBuffer ob)
-{
-    const std::map<OperatingBuffer, const char*> BuffertoString = {{OB_UNINIT, "ERR"},
-                                                                   {OB_USER_IN, "A"},
-                                                                   {OB_USER_OUT, "B"},
-                                                                   {OB_TEMP, "T"},
-                                                                   {OB_TEMP_CMPLX_FOR_REAL, "C"},
-                                                                   {OB_TEMP_BLUESTEIN, "S"}};
-    return BuffertoString.at(ob);
-}
-
-std::string PrintOptimizeStrategy(const rocfft_optimize_strategy ros)
-{
-    const std::map<rocfft_optimize_strategy, const char*> StrategytoString
-        = {{rocfft_optimize_min_buffer, "MINIMIZE_BUFFER"},
-           {rocfft_optimize_balance, "BALANCE_BUFFER_FUSION"},
-           {rocfft_optimize_max_fusion, "MAXIMIZE_FUSION"}};
-    return StrategytoString.at(ros);
-}
-
-std::string PrintSBRCTransposeType(const SBRC_TRANSPOSE_TYPE ty)
-{
-    const std::map<SBRC_TRANSPOSE_TYPE, const char*> TypetoString = {
-        {ENUMSTR(NONE)}, {ENUMSTR(DIAGONAL)}, {ENUMSTR(TILE_ALIGNED)}, {ENUMSTR(TILE_UNALIGNED)}};
-    return TypetoString.at(ty);
-}
-
-std::string PrintDirectToFromRegMode(const DirectRegType ty)
-{
-    const std::map<DirectRegType, const char*> TypetoString
-        = {{ENUMSTR(FORCE_OFF_OR_NOT_SUPPORT)}, {ENUMSTR(TRY_ENABLE_IF_SUPPORT)}};
-    return TypetoString.at(ty);
-}
-
 rocfft_status rocfft_plan_description_set_scale_factor(rocfft_plan_description description,
                                                        const double            scale_factor)
 {
@@ -118,6 +76,201 @@
                : 1;
 }
 
+void rocfft_plan_description_t::init_defaults(rocfft_transform_type        transformType,
+                                              rocfft_result_placement      placement,
+                                              size_t                       rank,
+                                              const std::array<size_t, 3>& lengths)
+{
+    // Default any unset array type to the interleaved layout for this transform type
+    if(inArrayType == rocfft_array_type_unset)
+    {
+        switch(transformType)
+        {
+        case rocfft_transform_type_complex_forward:
+        case rocfft_transform_type_complex_inverse:
+            inArrayType = rocfft_array_type_complex_interleaved;
+            break;
+        case rocfft_transform_type_real_inverse:
+            inArrayType = rocfft_array_type_hermitian_interleaved;
+            break;
+        case rocfft_transform_type_real_forward:
+            inArrayType = rocfft_array_type_real;
+            break;
+        }
+    }
+    if(outArrayType == rocfft_array_type_unset)
+    {
+        switch(transformType)
+        {
+        case rocfft_transform_type_complex_forward:
+        case rocfft_transform_type_complex_inverse:
+            outArrayType = rocfft_array_type_complex_interleaved;
+            break;
+        case rocfft_transform_type_real_forward:
+            outArrayType = rocfft_array_type_hermitian_interleaved;
+            break;
+        case rocfft_transform_type_real_inverse:
+            outArrayType = rocfft_array_type_real;
+            break;
+        }
+    }
+
+    // Set inStrides, if not specified (inStrides[0] == 0 is the unset marker)
+    if(inStrides[0] == 0)
+    {
+        inStrides[0] = 1;
+
+        if((transformType == rocfft_transform_type_real_forward)
+           && (placement == rocfft_placement_inplace))
+        {
+            // real-to-complex in-place: row stride is 2 * (1 + lengths[0] / 2) elements
+            size_t dist = 2 * (1 + (lengths[0]) / 2);
+
+            for(size_t i = 1; i < rank; i++)
+            {
+                inStrides[i] = dist;
+                dist *= lengths[i];
+            }
+
+            if(inDist == 0)
+                inDist = dist;
+        }
+        else if(transformType == rocfft_transform_type_real_inverse)
+        {
+            // complex-to-real: row stride is 1 + lengths[0] / 2 elements
+            size_t dist = 1 + (lengths[0]) / 2;
+
+            for(size_t i = 1; i < rank; i++)
+            {
+                inStrides[i] = dist;
+                dist *= lengths[i];
+            }
+
+            if(inDist == 0)
+                inDist = dist;
+        }
+
+        else
+        {
+            // Set the inStrides to deal with contiguous data
+            for(size_t i = 1; i < rank; i++)
+                inStrides[i] = lengths[i - 1] * inStrides[i - 1];
+        }
+    }
+
+    // Set outStrides, if not specified (outStrides[0] == 0 is the unset marker)
+    if(outStrides[0] == 0)
+    {
+        outStrides[0] = 1;
+
+        if((transformType == rocfft_transform_type_real_inverse)
+           && (placement == rocfft_placement_inplace))
+        {
+            // complex-to-real in-place: row stride is 2 * (1 + lengths[0] / 2) elements
+            size_t dist = 2 * (1 + (lengths[0]) / 2);
+
+            for(size_t i = 1; i < rank; i++)
+            {
+                outStrides[i] = dist;
+                dist *= lengths[i];
+            }
+
+            if(outDist == 0)
+                outDist = dist;
+        }
+        else if(transformType == rocfft_transform_type_real_forward)
+        {
+            // real-to-complex: row stride is 1 + lengths[0] / 2 elements
+            size_t dist = 1 + (lengths[0]) / 2;
+
+            for(size_t i = 1; i < rank; i++)
+            {
+                outStrides[i] = dist;
+                dist *= lengths[i];
+            }
+
+            if(outDist == 0)
+                outDist = dist;
+        }
+        else
+        {
+            // Set the outStrides to deal with contiguous data
+            for(size_t i = 1; i < rank; i++)
+                outStrides[i] = lengths[i - 1] * outStrides[i - 1];
+        }
+    }
+
+    // Set in and out distances, if still unset (0).  NOTE(review): rank - 1 assumes rank >= 1 - confirm
+    if(inDist == 0)
+    {
+        inDist = lengths[rank - 1] * inStrides[rank - 1];
+    }
+    if(outDist == 0)
+    {
+        outDist = lengths[rank - 1] * outStrides[rank - 1];
+    }
+}
+
+void rocfft_plan_t::sort()
+{
+    // Reorder lengths + strides fastest to slowest.  Work on a separate copy
+    // so that nothing is modified if the reordering is rejected below.
+    struct rocfft_iodim
+    {
+        size_t length;
+        size_t istride;
+        size_t ostride;
+    };
+
+    // complex-complex transforms can be freely reordered starting from
+    // the fastest dimension.  real-complex has to leave the fastest
+    // dimension alone
+    const size_t start_dim = (transformType == rocfft_transform_type_complex_forward
+                              || transformType == rocfft_transform_type_complex_inverse)
+                                 ? 0
+                                 : 1;
+
+    std::vector<rocfft_iodim> iodims;
+    for(size_t dim = start_dim; dim < rank; ++dim)
+        iodims.push_back(rocfft_iodim{lengths[dim], desc.inStrides[dim], desc.outStrides[dim]});
+    if(iodims.empty())
+        return;
+
+    bool sort_on_istride = true;
+    auto sorter          = [&sort_on_istride](const rocfft_iodim& a, const rocfft_iodim& b) {
+        // move any lengths of 1 to the end
+        if(a.length == 1 && b.length != 1)
+            return false;
+        if(b.length == 1 && a.length != 1)
+            return true;
+        return sort_on_istride ? (a.istride < b.istride) : (a.ostride < b.ostride);
+    };
+
+    // sort on istride first (sorter reads the flag by reference, so flipping it below takes effect)
+    std::sort(iodims.begin(), iodims.end(), sorter);
+
+    // if that means ostride is no longer sorted, then don't bother
+    // changing anything - the user is asking for some kind of
+    // transposed FFT so let's just assume they know what they're doing
+    sort_on_istride = false;
+    if(!std::is_sorted(iodims.begin(), iodims.end(), sorter))
+        return;
+
+    // chop off any lengths of 1 from the end (always keeping at least one entry)
+    while(iodims.size() > 1 && iodims.back().length == 1)
+    {
+        --rank;
+        iodims.pop_back();
+    }
+    // copy back the sorted lengths + strides
+    for(size_t dim = start_dim; dim < rank; ++dim)
+    {
+        lengths[dim]         = iodims[dim - start_dim].length;
+        desc.inStrides[dim]  = iodims[dim - start_dim].istride;
+        desc.outStrides[dim] = iodims[dim - start_dim].ostride;
+    }
+}
+
 rocfft_status rocfft_plan_description_set_data_layout(rocfft_plan_description description,
                                                       const rocfft_array_type in_array_type,
                                                       const rocfft_array_type out_array_type,
@@ -219,8 +372,8 @@
 
     rider << "-t " << plan->transformType << " ";
 
-    if(plan->precision == rocfft_precision_double)
-        rider << "--double ";
+    rider << "--precision ";
+    rider << precision_name(plan->precision) << " ";
     rider << "--itype " << plan->desc.inArrayType << " ";
     rider << "--otype " << plan->desc.outArrayType << " ";
     rider << "--istride ";
@@ -239,6 +392,120 @@
     return rider.str();
 }
 
+void set_bluestein_strides(const rocfft_plan plan, NodeMetaData& planData)
+{
+    std::array<size_t, 3> inStridesBlue  = {0, 0, 0};
+    std::array<size_t, 3> outStridesBlue = {0, 0, 0};
+    std::array<size_t, 3> lengthsBlue    = {0, 0, 0};
+    size_t                inDistBlue     = 0;
+    size_t                outDistBlue    = 0;
+
+    const auto precision     = plan->precision;
+    const auto transformType = plan->transformType;
+    const auto rank          = plan->rank;
+    const auto lengths       = plan->lengths;
+    const auto placement     = plan->placement;
+    const auto dimension     = planData.dimension;
+
+    assert(rank == dimension);
+
+    lengthsBlue[0] = NodeFactory::SupportedLength(precision, lengths[0])
+                         ? lengths[0]
+                         : NodeFactory::GetBluesteinLength(precision, lengths[0]);
+    for(size_t i = 1; i < dimension; i++)
+        lengthsBlue[i] = lengths[i];
+
+    // =================================
+    // inStrides (computed from lengthsBlue, not from the strides stored in the plan description)
+    // =================================
+    inStridesBlue[0] = 1;
+
+    if((transformType == rocfft_transform_type_real_forward)
+       && (placement == rocfft_placement_inplace))
+    {
+        // real-to-complex in-place
+        size_t dist = 2 * (1 + (lengthsBlue[0]) / 2);
+
+        for(size_t i = 1; i < rank; i++)
+        {
+            inStridesBlue[i] = dist;
+            dist *= lengthsBlue[i];
+        }
+
+        inDistBlue = dist;
+    }
+    else if(transformType == rocfft_transform_type_real_inverse)
+    {
+        // complex-to-real
+        size_t dist = 1 + (lengthsBlue[0]) / 2;
+
+        for(size_t i = 1; i < rank; i++)
+        {
+            inStridesBlue[i] = dist;
+            dist *= lengthsBlue[i];
+        }
+
+        inDistBlue = dist;
+    }
+    else
+    {
+        // Set the inStrides to deal with contiguous data
+        for(size_t i = 1; i < rank; i++)
+            inStridesBlue[i] = lengthsBlue[i - 1] * inStridesBlue[i - 1];
+
+        inDistBlue = lengthsBlue[rank - 1] * inStridesBlue[rank - 1];
+    }
+
+    // =================================
+    // outStrides  NOTE(review): branches reuse the inStrides conditions; init_defaults mirrors them for output - confirm intended
+    // =================================
+    outStridesBlue[0] = 1;
+
+    if((transformType == rocfft_transform_type_real_forward)
+       && (placement == rocfft_placement_inplace))
+    {
+        // real-to-complex in-place
+        size_t dist = 2 * (1 + (lengthsBlue[0]) / 2);
+
+        for(size_t i = 1; i < rank; i++)
+        {
+            outStridesBlue[i] = dist;
+            dist *= lengthsBlue[i];
+        }
+
+        outDistBlue = dist;
+    }
+    else if(transformType == rocfft_transform_type_real_inverse)
+    {
+        // complex-to-real
+        size_t dist = 1 + (lengthsBlue[0]) / 2;
+
+        for(size_t i = 1; i < rank; i++)
+        {
+            outStridesBlue[i] = dist;
+            dist *= lengthsBlue[i];
+        }
+
+        outDistBlue = dist;
+    }
+    else
+    {
+        // Set the outStrides to deal with contiguous data
+        for(size_t i = 1; i < rank; i++)
+            outStridesBlue[i] = lengthsBlue[i - 1] * outStridesBlue[i - 1];
+
+        outDistBlue = lengthsBlue[rank - 1] * outStridesBlue[rank - 1];
+    }
+
+    for(size_t i = 0; i < dimension; i++)
+    {
+        planData.inStrideBlue.push_back(inStridesBlue[i]);
+        planData.outStrideBlue.push_back(outStridesBlue[i]);
+    }
+    planData.iDistBlue = inDistBlue;
+    planData.oDistBlue = outDistBlue;
+}
+
 rocfft_status rocfft_plan_create_internal(rocfft_plan                   plan,
                                           const rocfft_result_placement placement,
                                           const rocfft_transform_type   transform_type,
@@ -248,58 +515,6 @@
                                           const size_t                  number_of_transforms,
                                           const rocfft_plan_description description)
 {
-    // Check plan validity
-    if(description != nullptr)
-    {
-        switch(transform_type)
-        {
-        case rocfft_transform_type_complex_forward:
-        case rocfft_transform_type_complex_inverse:
-            // We need complex input data
-            if(!((description->inArrayType == rocfft_array_type_complex_interleaved)
-                 || (description->inArrayType == rocfft_array_type_complex_planar)))
-                return rocfft_status_invalid_array_type;
-            // We need complex output data
-            if(!((description->outArrayType == rocfft_array_type_complex_interleaved)
-                 || (description->outArrayType == rocfft_array_type_complex_planar)))
-                return rocfft_status_invalid_array_type;
-            // In-place transform requires that the input and output
-            // format be identical
-            if(placement == rocfft_placement_inplace)
-            {
-                if(description->inArrayType != description->outArrayType)
-                    return rocfft_status_invalid_array_type;
-            }
-            break;
-        case rocfft_transform_type_real_forward:
-            // Input must be real
-            if(description->inArrayType != rocfft_array_type_real)
-                return rocfft_status_invalid_array_type;
-            // Output must be Hermitian
-            if(!((description->outArrayType == rocfft_array_type_hermitian_interleaved)
-                 || (description->outArrayType == rocfft_array_type_hermitian_planar)))
-                return rocfft_status_invalid_array_type;
-            // In-place transform must output to interleaved format
-            if((placement == rocfft_placement_inplace)
-               && (description->outArrayType != rocfft_array_type_hermitian_interleaved))
-                return rocfft_status_invalid_array_type;
-            break;
-        case rocfft_transform_type_real_inverse:
-            // Output must be real
-            if(description->outArrayType != rocfft_array_type_real)
-                return rocfft_status_invalid_array_type;
-            // Intput must be Hermitian
-            if(!((description->inArrayType == rocfft_array_type_hermitian_interleaved)
-                 || (description->inArrayType == rocfft_array_type_hermitian_planar)))
-                return rocfft_status_invalid_array_type;
-            // In-place transform must have interleaved input
-            if((placement == rocfft_placement_inplace)
-               && (description->inArrayType != rocfft_array_type_hermitian_interleaved))
-                return rocfft_status_invalid_array_type;
-            break;
-        }
-    }
-
     if(dimensions > 3)
         return rocfft_status_invalid_dimensions;
 
@@ -315,127 +530,66 @@
     p->batch          = number_of_transforms;
     p->placement      = placement;
     p->precision      = precision;
-    p->base_type_size = (precision == rocfft_precision_double) ? sizeof(double) : sizeof(float);
+    p->base_type_size = real_type_size(precision);
     p->transformType  = transform_type;
 
     if(description != nullptr)
     {
         p->desc = *description;
     }
-    else
-    {
-        switch(transform_type)
-        {
-        case rocfft_transform_type_complex_forward:
-        case rocfft_transform_type_complex_inverse:
-            p->desc.inArrayType  = rocfft_array_type_complex_interleaved;
-            p->desc.outArrayType = rocfft_array_type_complex_interleaved;
-            break;
-        case rocfft_transform_type_real_forward:
-            p->desc.inArrayType  = rocfft_array_type_real;
-            p->desc.outArrayType = rocfft_array_type_hermitian_interleaved;
-            break;
-        case rocfft_transform_type_real_inverse:
-            p->desc.inArrayType  = rocfft_array_type_hermitian_interleaved;
-            p->desc.outArrayType = rocfft_array_type_real;
-            break;
-        }
-    }
-
-    // Set inStrides, if not specified
-    if(p->desc.inStrides[0] == 0)
-    {
-        p->desc.inStrides[0] = 1;
-
-        if((p->transformType == rocfft_transform_type_real_forward)
-           && (p->placement == rocfft_placement_inplace))
-        {
-            // real-to-complex in-place
-            size_t dist = 2 * (1 + (p->lengths[0]) / 2);
+    p->desc.init_defaults(p->transformType, p->placement, p->rank, p->lengths);
 
-            for(size_t i = 1; i < (p->rank); i++)
-            {
-                p->desc.inStrides[i] = dist;
-                dist *= p->lengths[i];
-            }
-
-            if(p->desc.inDist == 0)
-                p->desc.inDist = dist;
-        }
-        else if(p->transformType == rocfft_transform_type_real_inverse)
-        {
-            // complex-to-real
-            size_t dist = 1 + (p->lengths[0]) / 2;
-
-            for(size_t i = 1; i < (p->rank); i++)
-            {
-                p->desc.inStrides[i] = dist;
-                dist *= p->lengths[i];
-            }
-
-            if(p->desc.inDist == 0)
-                p->desc.inDist = dist;
-        }
-
-        else
-        {
-            // Set the inStrides to deal with contiguous data
-            for(size_t i = 1; i < (p->rank); i++)
-                p->desc.inStrides[i] = p->lengths[i - 1] * p->desc.inStrides[i - 1];
-        }
-    }
-
-    // Set outStrides, if not specified
-    if(p->desc.outStrides[0] == 0)
+    // Check plan validity
+    switch(transform_type)
     {
-        p->desc.outStrides[0] = 1;
-
-        if((p->transformType == rocfft_transform_type_real_inverse)
-           && (p->placement == rocfft_placement_inplace))
-        {
-            // complex-to-real in-place
-            size_t dist = 2 * (1 + (p->lengths[0]) / 2);
-
-            for(size_t i = 1; i < (p->rank); i++)
-            {
-                p->desc.outStrides[i] = dist;
-                dist *= p->lengths[i];
-            }
-
-            if(p->desc.outDist == 0)
-                p->desc.outDist = dist;
-        }
-        else if(p->transformType == rocfft_transform_type_real_forward)
-        {
-            // real-co-complex
-            size_t dist = 1 + (p->lengths[0]) / 2;
-
-            for(size_t i = 1; i < (p->rank); i++)
-            {
-                p->desc.outStrides[i] = dist;
-                dist *= p->lengths[i];
-            }
-
-            if(p->desc.outDist == 0)
-                p->desc.outDist = dist;
-        }
-        else
+    case rocfft_transform_type_complex_forward:
+    case rocfft_transform_type_complex_inverse:
+        // We need complex input data
+        if(!((p->desc.inArrayType == rocfft_array_type_complex_interleaved)
+             || (p->desc.inArrayType == rocfft_array_type_complex_planar)))
+            return rocfft_status_invalid_array_type;
+        // We need complex output data
+        if(!((p->desc.outArrayType == rocfft_array_type_complex_interleaved)
+             || (p->desc.outArrayType == rocfft_array_type_complex_planar)))
+            return rocfft_status_invalid_array_type;
+        // In-place transform requires that the input and output
+        // format be identical
+        if(placement == rocfft_placement_inplace)
         {
-            // Set the outStrides to deal with contiguous data
-            for(size_t i = 1; i < (p->rank); i++)
-                p->desc.outStrides[i] = p->lengths[i - 1] * p->desc.outStrides[i - 1];
+            if(p->desc.inArrayType != p->desc.outArrayType)
+                return rocfft_status_invalid_array_type;
         }
+        break;
+    case rocfft_transform_type_real_forward:
+        // Input must be real
+        if(p->desc.inArrayType != rocfft_array_type_real)
+            return rocfft_status_invalid_array_type;
+        // Output must be Hermitian
+        if(!((p->desc.outArrayType == rocfft_array_type_hermitian_interleaved)
+             || (p->desc.outArrayType == rocfft_array_type_hermitian_planar)))
+            return rocfft_status_invalid_array_type;
+        // In-place transform must output to interleaved format
+        if((placement == rocfft_placement_inplace)
+           && (p->desc.outArrayType != rocfft_array_type_hermitian_interleaved))
+            return rocfft_status_invalid_array_type;
+        break;
+    case rocfft_transform_type_real_inverse:
+        // Output must be real
+        if(p->desc.outArrayType != rocfft_array_type_real)
+            return rocfft_status_invalid_array_type;
+        // Input must be Hermitian
+        if(!((p->desc.inArrayType == rocfft_array_type_hermitian_interleaved)
+             || (p->desc.inArrayType == rocfft_array_type_hermitian_planar)))
+            return rocfft_status_invalid_array_type;
+        // In-place transform must have interleaved input
+        if((placement == rocfft_placement_inplace)
+           && (p->desc.inArrayType != rocfft_array_type_hermitian_interleaved))
+            return rocfft_status_invalid_array_type;
+        break;
     }
 
-    // Set in and out Distances, if not specified
-    if(p->desc.inDist == 0)
-    {
-        p->desc.inDist = p->lengths[p->rank - 1] * p->desc.inStrides[p->rank - 1];
-    }
-    if(p->desc.outDist == 0)
-    {
-        p->desc.outDist = p->lengths[p->rank - 1] * p->desc.outStrides[p->rank - 1];
-    }
+    // sort the parameters to be row major, in case they're not
+    plan->sort();
 
     log_bench(rocfft_rider_command(p));
 
@@ -469,19 +623,26 @@
         rootPlanData.rootIsC2C    = (rootPlanData.inArrayType != rocfft_array_type_real)
                                  && (rootPlanData.outArrayType != rocfft_array_type_real);
 
-        ExecPlan& execPlan = plan->execPlan;
-        int       deviceId = 0;
-        if(hipGetDevice(&deviceId) != hipSuccess)
-        {
-            throw std::runtime_error("hipGetDevice failed.");
-        }
-        if(hipGetDeviceProperties(&(execPlan.deviceProp), deviceId) != hipSuccess)
+        set_bluestein_strides(plan, rootPlanData);
+
+        ExecPlan& execPlan      = plan->execPlan;
+        execPlan.deviceProp     = get_curr_device_prop();
+        rootPlanData.deviceProp = execPlan.deviceProp;
+
+        execPlan.rootPlan = NodeFactory::CreateExplicitNode(rootPlanData, nullptr);
+
+        // If we are initializing tuning now, we shouldn't apply any solution,
+        // since we are still enumerating the candidate solutions
+        if(TuningBenchmarker::GetSingleton().IsInitializingTuning() == false)
         {
-            throw std::runtime_error("hipGetDeviceProperties failed for deviceId "
-                                     + std::to_string(deviceId));
+            execPlan.rootScheme = ApplySolution(execPlan);
+            if(execPlan.rootScheme)
+            {
+                execPlan.rootPlan = nullptr;
+                execPlan.rootPlan = NodeFactory::CreateExplicitNode(
+                    rootPlanData, nullptr, execPlan.rootScheme->curScheme);
+            }
         }
-        rootPlanData.deviceProp = execPlan.deviceProp;
-        execPlan.rootPlan       = NodeFactory::CreateExplicitNode(rootPlanData, nullptr);
 
         std::copy(plan->lengths.begin(),
                   plan->lengths.begin() + plan->rank,
@@ -506,6 +667,16 @@
         // set scaling on the root plan
         execPlan.rootPlan->scale_factor = p->desc.scale_factor;
 
+        // check if we are doing tuning init now. If yes, we just return
+        // since we are not going to do the execution
+        if(TuningBenchmarker::GetSingleton().IsInitializingTuning())
+        {
+            EnumerateTrees(execPlan);
+            TuningBenchmarker::GetSingleton().GetPacket()->init_step = false;
+            TuningBenchmarker::GetSingleton().GetPacket()->is_tuning = true;
+            return rocfft_status_success;
+        }
+
         try
         {
             ProcessNode(execPlan); // TODO: more descriptions are needed
@@ -523,9 +694,17 @@
 
         if(!PlanPowX(execPlan)) // PlanPowX enqueues the GPU kernels by function
         {
-
             throw std::runtime_error("Unable to create execution plan.");
         }
+
+        // when running each solution during tuning, get the information to packet,
+        // then we can dump the information to a table for analysis
+        if(TuningBenchmarker::GetSingleton().IsProcessingTuning())
+        {
+            if(!GetTuningKernelInfo(execPlan))
+                throw std::runtime_error("Unable to get the solution info.");
+        }
+
         return rocfft_status_success;
     }
     catch(std::exception& e)
@@ -611,9 +790,7 @@
 {
     log_trace(__func__, "plan", plan);
     rocfft_cout << std::endl;
-    rocfft_cout << "precision: "
-                << ((plan->precision == rocfft_precision_single) ? "single" : "double")
-                << std::endl;
+    rocfft_cout << "precision: " << precision_name(plan->precision) << std::endl;
 
     rocfft_cout << "transform type: ";
     switch(plan->transformType)
@@ -750,15 +927,43 @@
     return rocfft_status_success;
 }
 
+// Pick the large-twiddle base exponent and count the steps with (2^base)^steps >= large1DLen
+//  3-Steps: base = ( CeilPo2(large1DLen) + 2 ) / 3, clamped to [4, 6]
+//   e.g., ( CeilPo2(10000)+ 2 ) / 3 , returns 5 : (2^5)*(2^5)*(2^5) = 32768 >= 10000
+//  otherwise: base is 8 (2^8 = 256), 2-steps 256^2 = 65536, if exceeded then 256^3
+void get_large_twd_base_steps(size_t large1DLen, bool use3steps, size_t& base, size_t& steps)
+{
+    if(use3steps)
+        base = std::min((size_t)6, std::max((size_t)4, (CeilPo2(large1DLen) + 2) / 3));
+    else
+        base = 8;
+
+    // smallest step count whose power of the base length reaches large1DLen
+    steps                   = 0;
+    const size_t twdBaseLen = pow(2, base);
+    while(pow(twdBaseLen, steps) < large1DLen)
+        ++steps;
+
+    if(steps > 3 && base == 8)
+        throw std::runtime_error(
+            "large-twd-base 8 could be 2,3 steps, but not supported for 4-steps yet");
+    if(steps != 3 && base < 8)
+        throw std::runtime_error("large-twd-base for 4,5,6 must be 3-steps");
+}
+
 void TreeNode::CopyNodeData(const TreeNode& srcNode)
 {
     dimension       = srcNode.dimension;
     batch           = srcNode.batch;
     length          = srcNode.length;
     inStride        = srcNode.inStride;
+    inStrideBlue    = srcNode.inStrideBlue;
     outStride       = srcNode.outStride;
+    outStrideBlue   = srcNode.outStrideBlue;
     iDist           = srcNode.iDist;
+    iDistBlue       = srcNode.iDistBlue;
     oDist           = srcNode.oDist;
+    oDistBlue       = srcNode.oDistBlue;
     iOffset         = srcNode.iOffset;
     oOffset         = srcNode.oOffset;
     placement       = srcNode.placement;
@@ -775,6 +980,9 @@
     largeTwd3Steps = srcNode.largeTwd3Steps;
     largeTwdBase   = srcNode.largeTwdBase;
     lengthBlue     = srcNode.lengthBlue;
+    lengthBlueN    = srcNode.lengthBlueN;
+    typeBlue       = srcNode.typeBlue;
+    fuseBlue       = srcNode.fuseBlue;
 
     //
     obIn  = srcNode.obIn;
@@ -793,21 +1001,25 @@
 
 void TreeNode::CopyNodeData(const NodeMetaData& data)
 {
-    dimension    = data.dimension;
-    batch        = data.batch;
-    length       = data.length;
-    inStride     = data.inStride;
-    outStride    = data.outStride;
-    iDist        = data.iDist;
-    oDist        = data.oDist;
-    iOffset      = data.iOffset;
-    oOffset      = data.oOffset;
-    placement    = data.placement;
-    precision    = data.precision;
-    direction    = data.direction;
-    inArrayType  = data.inArrayType;
-    outArrayType = data.outArrayType;
-    deviceProp   = data.deviceProp;
+    dimension     = data.dimension;
+    batch         = data.batch;
+    length        = data.length;
+    inStride      = data.inStride;
+    inStrideBlue  = data.inStrideBlue;
+    outStride     = data.outStride;
+    outStrideBlue = data.outStrideBlue;
+    iDist         = data.iDist;
+    iDistBlue     = data.iDistBlue;
+    oDist         = data.oDist;
+    oDistBlue     = data.oDistBlue;
+    iOffset       = data.iOffset;
+    oOffset       = data.oOffset;
+    placement     = data.placement;
+    precision     = data.precision;
+    direction     = data.direction;
+    inArrayType   = data.inArrayType;
+    outArrayType  = data.outArrayType;
+    deviceProp    = data.deviceProp;
 }
 
 bool TreeNode::isPlacementAllowed(rocfft_result_placement test_placement) const
@@ -842,7 +1054,7 @@
 // That should be done in buffer assignment stage or
 // TraverseTreeAssignPlacementsLogicA().
 
-void TreeNode::RecursiveBuildTree()
+void TreeNode::RecursiveBuildTree(SchemeTree* solution_scheme)
 {
     // Some-Common-Work...
     // We must follow the placement of RootPlan, so needs to make it explicit
@@ -852,11 +1064,18 @@
         allowOutofplace = !allowInplace;
     }
 
+    SchemeVec child_schemes;
+    if(solution_scheme)
+    {
+        for(const auto& child : solution_scheme->children)
+            child_schemes.push_back(child->curScheme);
+    }
+
     // overriden by each derived class
-    BuildTree_internal();
+    BuildTree_internal(child_schemes);
 }
 
-void TreeNode::SanityCheck()
+void TreeNode::SanityCheck(SchemeTree* solution_scheme, std::vector<FMKey>& kernel_keys)
 {
     // no un-defined node is allowed in the tree
     if(nodeType == NT_UNDEFINED)
@@ -880,11 +1099,24 @@
     if(length.size() < dimension)
         throw std::runtime_error("not enough length[] for dimension");
 
+    // make sure the tree is decomposed the same way as the scheme tree from the solution map
+    if(solution_scheme)
+    {
+        if(childNodes.size() != solution_scheme->children.size())
+            throw std::runtime_error("scheme-decomposition error: plan-tree != scheme-tree");
+        if(scheme != solution_scheme->curScheme)
+            throw std::runtime_error("scheme-decomposition error: node-scheme != solution-scheme");
+    }
+
     OperatingBuffer previousOut = obIn;
-    for(auto& child : childNodes)
+    for(size_t id = 0; id < childNodes.size(); ++id)
     {
+        auto&       child = childNodes[id];
+        SchemeTree* child_scheme
+            = (solution_scheme) ? solution_scheme->children[id].get() : nullptr;
+
         // 1. Recursively check child
-        child->SanityCheck();
+        child->SanityCheck(child_scheme, kernel_keys);
 
         // 2. Assert that the kernel chain is connected
         // Note: The Bluestein algorithm uses setup nodes that aren't
@@ -905,8 +1137,8 @@
 {
     if(function_pool::has_SBRC_kernel(length[0], precision))
     {
-        auto kernel
-            = function_pool::get_kernel(fpkey(length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
+        auto kernel = function_pool::get_kernel(
+            fpkey(length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC, TILE_ALIGNED));
         size_t bwd = kernel.transforms_per_block;
         if((length[1] >= bwd) && (length[2] >= bwd) && (length[1] * length[2] % bwd == 0))
             return true;
@@ -940,32 +1172,6 @@
     return false;
 }
 
-// Compute the large twd decomposition base
-// 2-Steps:
-//  e.g., ( CeilPo2(10000)+ 1 ) / 2 , returns 7 : (2^7)*(2^7) = 16384 >= 10000
-// 3-Steps:
-//  e.g., ( CeilPo2(10000)+ 2 ) / 3 , returns 5 : (2^5)*(2^5)*(2^5) = 32768 >= 10000
-void TreeNode::set_large_twd_base_steps(size_t largeTWDLength)
-{
-    // if is largeTwd3Steps, then 16^3 ~ 64^3, basically enough for 262144
-    // else, base is 8 (2^8 = 256), could be 2-steps 256^2 = 65536, if exceed, then is 256^3, and so on..
-    largeTwdBase = this->largeTwd3Steps
-                       ? std::min((size_t)6, std::max((size_t)4, (CeilPo2(largeTWDLength) + 2) / 3))
-                       : 8;
-
-    // but we still want to know the exact steps we will loop
-    ltwdSteps              = 0;
-    size_t lenLargeTwdBase = pow(2, largeTwdBase);
-    while(pow(lenLargeTwdBase, ltwdSteps) < largeTWDLength)
-        ltwdSteps++;
-
-    if(largeTwdBase == 8 && ltwdSteps > 3)
-        throw std::runtime_error(
-            "large-twd-base 8 could be 2,3 steps, but not supported for 4-steps yet");
-    if(largeTwdBase < 8 && ltwdSteps != 3)
-        throw std::runtime_error("large-twd-base for 4,5,6 must be 3-steps");
-}
-
 void TreeNode::ApplyFusion()
 {
     // Do the final fusion after the buffer assign is completed
@@ -1007,11 +1213,16 @@
                      ->get();
     auto last = childNodes.back().get();
 
-    this->obIn         = first->obIn;
-    this->obOut        = last->obOut;
-    this->placement    = (obIn == obOut) ? rocfft_placement_inplace : rocfft_placement_notinplace;
-    this->inArrayType  = first->inArrayType;
-    this->outArrayType = last->outArrayType;
+    // Skip first node in multi-kernel fused Bluestein
+    // since it is not connected to the buffer chain
+    if(fuseBlue != BFT_FWD_CHIRP)
+    {
+        this->obIn      = first->obIn;
+        this->obOut     = last->obOut;
+        this->placement = (obIn == obOut) ? rocfft_placement_inplace : rocfft_placement_notinplace;
+        this->inArrayType  = first->inArrayType;
+        this->outArrayType = last->outArrayType;
+    }
 }
 
 void TreeNode::AssignParams()
@@ -1022,7 +1233,9 @@
     for(auto& child : childNodes)
     {
         child->inStride.clear();
+        child->inStrideBlue.clear();
         child->outStride.clear();
+        child->outStrideBlue.clear();
     }
 
     AssignParams_internal();
@@ -1106,14 +1319,19 @@
 {
     if(nodeType == NT_LEAF)
     {
-        auto outputPtrDiff = compute_ptrdiff(
-            UseOutputLengthForPadding() ? GetOutputLength() : length, outStride, batch, oDist);
+        auto outputPtrDiff
+            = compute_ptrdiff(UseOutputLengthForPadding() ? GetOutputLength() : length,
+                              (typeBlue == BT_MULTI_KERNEL_FUSED) ? outStrideBlue : outStride,
+                              batch,
+                              (typeBlue == BT_MULTI_KERNEL_FUSED) ? oDistBlue : oDist);
 
         if(scheme == CS_KERNEL_CHIRP)
             chirpSize = std::max(lengthBlue, chirpSize);
 
         if(obOut == OB_TEMP_BLUESTEIN)
-            blueSize = std::max(outputPtrDiff, blueSize);
+            blueSize = std::max(typeBlue == BT_MULTI_KERNEL_FUSED ? outputPtrDiff + lengthBlue
+                                                                  : outputPtrDiff,
+                                blueSize);
 
         if(obOut == OB_TEMP_CMPLX_FOR_REAL)
             cmplxForRealSize = std::max(outputPtrDiff, cmplxForRealSize);
@@ -1133,12 +1351,12 @@
     while(i--)
         indentStr += "    ";
 
-    os << "\n" << indentStr.c_str() << "scheme: " << PrintScheme(scheme).c_str();
-    os << "\n" << indentStr.c_str();
+    os << "\n" << indentStr << "scheme: " << PrintScheme(scheme);
+    os << "\n" << indentStr;
     os << "dimension: " << dimension;
-    os << "\n" << indentStr.c_str();
+    os << "\n" << indentStr;
     os << "batch: " << batch;
-    os << "\n" << indentStr.c_str();
+    os << "\n" << indentStr;
     os << "length: ";
     for(size_t i = 0; i < length.size(); i++)
     {
@@ -1146,7 +1364,7 @@
     }
     if(!outputLength.empty())
     {
-        os << "\n" << indentStr.c_str();
+        os << "\n" << indentStr;
         os << "outputLength: ";
         for(size_t i = 0; i < outputLength.size(); i++)
         {
@@ -1154,119 +1372,102 @@
         }
     }
 
-    os << "\n" << indentStr.c_str() << "iStrides: ";
+    os << "\n" << indentStr << "iStrides: ";
     for(size_t i = 0; i < inStride.size(); i++)
         os << inStride[i] << " ";
 
-    os << "\n" << indentStr.c_str() << "oStrides: ";
+    if(typeBlue == BT_MULTI_KERNEL_FUSED)
+    {
+        os << "\n" << indentStr << "iStridesBlue: ";
+        for(size_t i = 0; i < inStrideBlue.size(); i++)
+            os << inStrideBlue[i] << " ";
+    }
+
+    os << "\n" << indentStr << "oStrides: ";
     for(size_t i = 0; i < outStride.size(); i++)
         os << outStride[i] << " ";
 
+    if(typeBlue == BT_MULTI_KERNEL_FUSED)
+    {
+        os << "\n" << indentStr << "oStridesBlue: ";
+        for(size_t i = 0; i < outStrideBlue.size(); i++)
+            os << outStrideBlue[i] << " ";
+    }
+
     if(iOffset)
     {
-        os << "\n" << indentStr.c_str();
+        os << "\n" << indentStr;
         os << "iOffset: " << iOffset;
     }
     if(oOffset)
     {
-        os << "\n" << indentStr.c_str();
+        os << "\n" << indentStr;
         os << "oOffset: " << oOffset;
     }
 
-    os << "\n" << indentStr.c_str();
+    os << "\n" << indentStr;
     os << "iDist: " << iDist;
-    os << "\n" << indentStr.c_str();
+    if(typeBlue == BT_MULTI_KERNEL_FUSED)
+    {
+        os << "\n" << indentStr;
+        os << "iDistBlue: " << iDistBlue;
+    }
+    os << "\n" << indentStr;
     os << "oDist: " << oDist;
+    if(typeBlue == BT_MULTI_KERNEL_FUSED)
+    {
+        os << "\n" << indentStr;
+        os << "oDistBlue: " << oDistBlue;
+    }
 
-    os << "\n" << indentStr.c_str();
+    os << "\n" << indentStr;
     os << "direction: " << direction;
 
-    os << "\n" << indentStr.c_str();
-    os << ((placement == rocfft_placement_inplace) ? "inplace" : "not inplace");
-
-    os << "\n" << indentStr.c_str();
+    os << "\n" << indentStr;
+    os << "placement: " << PrintPlacement(placement);
 
-    os << ((precision == rocfft_precision_single) ? "single-precision" : "double-precision");
+    os << "\n" << indentStr;
+    os << precision_name(precision) << "-precision";
 
-    os << std::endl << indentStr.c_str();
+    os << std::endl << indentStr;
     os << "array type: ";
-    switch(inArrayType)
-    {
-    case rocfft_array_type_complex_interleaved:
-        os << "complex interleaved";
-        break;
-    case rocfft_array_type_complex_planar:
-        os << "complex planar";
-        break;
-    case rocfft_array_type_real:
-        os << "real";
-        break;
-    case rocfft_array_type_hermitian_interleaved:
-        os << "hermitian interleaved";
-        break;
-    case rocfft_array_type_hermitian_planar:
-        os << "hermitian planar";
-        break;
-    default:
-        os << "unset";
-        break;
-    }
+    os << PrintArrayType(inArrayType);
     os << " -> ";
-    switch(outArrayType)
-    {
-    case rocfft_array_type_complex_interleaved:
-        os << "complex interleaved";
-        break;
-    case rocfft_array_type_complex_planar:
-        os << "complex planar";
-        break;
-    case rocfft_array_type_real:
-        os << "real";
-        break;
-    case rocfft_array_type_hermitian_interleaved:
-        os << "hermitian interleaved";
-        break;
-    case rocfft_array_type_hermitian_planar:
-        os << "hermitian planar";
-        break;
-    default:
-        os << "unset";
-        break;
-    }
+    os << PrintArrayType(outArrayType);
+
     if(large1D)
     {
-        os << "\n" << indentStr.c_str() << "large1D: " << large1D;
-        os << "\n" << indentStr.c_str() << "largeTwdBase: " << largeTwdBase;
-        os << "\n" << indentStr.c_str() << "largeTwdSteps: " << ltwdSteps;
+        os << "\n" << indentStr << "large1D: " << large1D;
+        os << "\n" << indentStr << "largeTwdBase: " << largeTwdBase;
+        os << "\n" << indentStr << "largeTwdSteps: " << ltwdSteps;
     }
     if(twiddles)
     {
         os << "\n"
-           << indentStr.c_str()
-           << "twiddle table length: " << twiddles_size / sizeof_precision(precision);
+           << indentStr << "twiddle table length: " << twiddles_size / complex_type_size(precision);
     }
     if(twiddles_large)
     {
         os << "\n"
-           << indentStr.c_str()
-           << "large twiddle table length: " << twiddles_large_size / sizeof_precision(precision);
+           << indentStr
+           << "large twiddle table length: " << twiddles_large_size / complex_type_size(precision);
     }
     if(lengthBlue)
-        os << "\n" << indentStr.c_str() << "lengthBlue: " << lengthBlue;
+        os << "\n" << indentStr << "lengthBlue: " << lengthBlue;
     os << "\n";
     switch(ebtype)
     {
     case EmbeddedType::NONE:
         break;
     case EmbeddedType::C2Real_PRE:
-        os << indentStr.c_str() << "EmbeddedType: C2Real_PRE\n";
+        os << indentStr << "EmbeddedType: C2Real_PRE\n";
         break;
     case EmbeddedType::Real2C_POST:
-        os << indentStr.c_str() << "EmbeddedType: Real2C_POST\n";
+        os << indentStr << "EmbeddedType: Real2C_POST\n";
         break;
     }
 
-    os << indentStr.c_str() << "SBRC_Trans_Type: " << PrintSBRCTransposeType(sbrcTranstype).c_str();
+    os << indentStr << "SBRC_Trans_Type: " << PrintSBRCTransposeType(sbrcTranstype);
     os << "\n";
 
     switch(intrinsicMode)
@@ -1274,15 +1475,14 @@
     case IntrinsicAccessType::DISABLE_BOTH:
         break;
     case IntrinsicAccessType::ENABLE_LOAD_ONLY:
-        os << indentStr.c_str() << "Intrinsic Mode: LOAD_ONLY\n";
+        os << indentStr << "Intrinsic Mode: LOAD_ONLY\n";
         break;
     case IntrinsicAccessType::ENABLE_BOTH:
-        os << indentStr.c_str() << "Intrinsic Mode: LOAD_AND_STORE\n";
+        os << indentStr << "Intrinsic Mode: LOAD_AND_STORE\n";
         break;
     }
 
-    os << indentStr.c_str()
-       << "Direct_to_from_Reg: " << PrintDirectToFromRegMode(dir2regMode).c_str();
+    os << indentStr << "Direct_to_from_Reg: " << PrintDirectToFromRegMode(dir2regMode);
     os << "\n";
     if(IsScalingEnabled())
         os << indentStr << "scale factor: " << scale_factor << "\n";
@@ -1305,6 +1505,31 @@
     std::cout << std::flush;
 }
 
+// Collect, in pre-order, this node and every descendant whose scheme equals findScheme
+void TreeNode::RecursiveFindChildNodes(const ComputeScheme&    findScheme,
+                                       std::vector<TreeNode*>& nodes)
+{
+    if(findScheme == scheme)
+        nodes.push_back(this);
+    for(size_t idx = 0; idx < childNodes.size(); ++idx)
+        childNodes[idx]->RecursiveFindChildNodes(findScheme, nodes);
+}
+
+// Copy srcNode's data into this node, then each child of srcNode into the child at the same index
+void TreeNode::RecursiveCopyNodeData(const TreeNode& srcNode)
+{
+    CopyNodeData(srcNode);
+
+    const std::size_t childCount = childNodes.size();
+    if(childCount != srcNode.childNodes.size())
+        throw std::runtime_error("Invalid copy of source tree data");
+
+    // NOTE(review): children are copied with CopyNodeData, not RecursiveCopyNodeData,
+    // so this loop itself does not descend below the direct children - confirm intended
+    for(std::size_t idx = 0; idx < childCount; ++idx)
+        childNodes[idx]->CopyNodeData(*srcNode.childNodes[idx]);
+}
+
 void TreeNode::RecursiveRemoveNode(TreeNode* node)
 {
     for(auto& child : childNodes)
@@ -1402,8 +1627,26 @@
 
 void RuntimeCompilePlan(ExecPlan& execPlan)
 {
-    for(auto& node : execPlan.execSeq)
-        node->compiledKernel = RTCKernel::runtime_compile(*node, execPlan.deviceProp.gcnArchName);
+    std::string kernel_name;
+    bool        is_tuning = TuningBenchmarker::GetSingleton().IsProcessingTuning();
+
+    for(size_t i = 0; i < execPlan.execSeq.size(); ++i)
+    {
+        auto& node = execPlan.execSeq[i];
+
+        node->compiledKernel
+            = RTCKernel::runtime_compile(*node, execPlan.deviceProp.gcnArchName, kernel_name);
+
+        // Log kernel name when tuning
+        if(is_tuning)
+        {
+            TuningBenchmarker::GetSingleton().GetPacket()->kernel_names[i] = kernel_name;
+            if(LOG_TUNING_ENABLED())
+                (*LogSingleton::GetInstance().GetTuningOS())
+                    << "kernel: " << kernel_name << std::endl;
+        }
+    }
+
     TreeNode* load_node             = nullptr;
     TreeNode* store_node            = nullptr;
     std::tie(load_node, store_node) = execPlan.get_load_store_nodes();
@@ -1412,15 +1655,16 @@
     bool need_callbacks = !array_type_is_planar(load_node->inArrayType)
                           && !array_type_is_planar(store_node->outArrayType);
 
-    if(need_callbacks)
+    // when tuning, don't spend time compiling the callback variants of the kernels
+    if(need_callbacks && !is_tuning)
     {
-        load_node->compiledKernelWithCallbacks
-            = RTCKernel::runtime_compile(*load_node, execPlan.deviceProp.gcnArchName, true);
+        load_node->compiledKernelWithCallbacks = RTCKernel::runtime_compile(
+            *load_node, execPlan.deviceProp.gcnArchName, kernel_name, true);
 
         if(store_node != load_node)
         {
-            store_node->compiledKernelWithCallbacks
-                = RTCKernel::runtime_compile(*store_node, execPlan.deviceProp.gcnArchName, true);
+            store_node->compiledKernelWithCallbacks = RTCKernel::runtime_compile(
+                *store_node, execPlan.deviceProp.gcnArchName, kernel_name, true);
         }
     }
 
@@ -1436,9 +1680,227 @@
     }
 }
 
+// Build the two solution-map lookup tokens that describe the problem of probNode
+void GetNodeToken(const TreeNode& probNode, std::string& min_token, std::string& full_token)
+{
+    // min_token:  scheme abbr (non-problem schemes only), lengths, precision, placement,
+    //             complex/real, and direction for real-trans (R2C/C2R)
+    // full_token: min_token plus direction for complex, then batch, iDist and oDist
+    // When searching solution, looking for full-match first, and then min-match
+
+    // a leaf node without a kernel key (TRANSPOSE, call_back or others with
+    // external-kernel = false) is not tuned currently, but still needs an entry in
+    // the map, so both tokens are set to a pre-defined token
+    if(probNode.isLeafNode() && probNode.GetKernelKey() == EmptyFMKey)
+    {
+        min_token = full_token = solution_map::LEAFNODE_TOKEN_BUILTIN_KERNEL;
+        return;
+    }
+
+    std::string key;
+    if(!ComputeSchemeIsAProblem(probNode.scheme))
+        key = PrintKernelSchemeAbbr(probNode.scheme) + "_";
+
+    for(size_t dim = 0; dim < probNode.dimension; ++dim)
+        key += std::to_string(probNode.length[dim]) + "_";
+
+    // NOTE(review): every precision other than single is labelled "dp_" - confirm intended
+    key += (probNode.precision == rocfft_precision_single) ? "sp_" : "dp_";
+    key += (probNode.placement == rocfft_placement_inplace) ? "ip_" : "op_";
+
+    const bool forward  = (probNode.direction == -1);
+    const bool realType = (probNode.inArrayType == rocfft_array_type_real)
+                          || (probNode.outArrayType == rocfft_array_type_real);
+
+    if(realType)
+    {
+        key += "real_";
+        key += forward ? "fwd_" : "bwd_";
+        min_token = key;
+    }
+    else
+    {
+        key += "complex";
+        min_token = key;
+        key += forward ? "_fwd_" : "_bwd_";
+    }
+
+    key += "batch_" + std::to_string(probNode.batch) + "_idist_"
+           + std::to_string(probNode.iDist) + "_odist_" + std::to_string(probNode.oDist);
+
+    full_token = key;
+}
+
+// Fill possibleKeys with every lookup key of a root problem, most specific first:
+// (arch, full), (arch, min), ("any", full), ("any", min)
+void GenerateProbKeys(const TreeNode& probNode, std::vector<ProblemKey>& possibleKeys)
+{
+    possibleKeys.clear();
+
+    const std::string archName = get_arch_name(probNode.deviceProp);
+    std::string       minToken, fullToken;
+    GetNodeToken(probNode, minToken, fullToken);
+
+    const std::string archChoices[]  = {archName, "any"};
+    const std::string tokenChoices[] = {fullToken, minToken};
+
+    for(const auto& archChoice : archChoices)
+    {
+        for(const auto& tokenChoice : tokenChoices)
+            possibleKeys.push_back(ProblemKey(archChoice, tokenChoice));
+    }
+}
+
+// recursively apply the solutions (depth-first: each child sub-problem is resolved fully in turn)
+// return: the sub-scheme-tree of this problem, or nullptr when no complete solution is found
+// For each leaf solution, the kernel_key is appended to execPlan.solution_kernels
+std::unique_ptr<SchemeTree>
+    RecursivelyApplySol(const ProblemKey& problemKey, ExecPlan& execPlan, size_t sol_option)
+{
+    auto& sol_map_single = solution_map::get_solution_map();
+    if(!sol_map_single.has_solution_node(problemKey, sol_option))
+        return nullptr;
+
+    std::string  arch     = problemKey.arch;
+    SolutionNode sol_node = sol_map_single.get_solution_node(problemKey, sol_option);
+
+    // it is a dummy solution.
+    if(sol_node.using_scheme == CS_NONE)
+    {
+        if(LOG_TRACE_ENABLED())
+            (*LogSingleton::GetInstance().GetTraceOS())
+                << "found a dummy root-solution(" << arch << ", " << problemKey.probToken << ")"
+                << std::endl;
+        return nullptr;
+    }
+
+    std::unique_ptr<SchemeTree> curScheme
+        = std::make_unique<SchemeTree>(SchemeTree(sol_node.using_scheme));
+
+    if(sol_node.sol_node_type == SOL_INTERNAL_NODE)
+    {
+        if(sol_node.solution_childnodes.empty())
+            return nullptr;
+
+        // we stick to the current arch same as the root's problemkey
+        // e.g even we are in gfx908, but if the found root solution is in "any" map,
+        // then we should keep looking-up the "any" map
+        for(auto& child_node : sol_node.solution_childnodes)
+        {
+            ProblemKey probKey(arch, child_node.child_token);
+            auto childScheme = RecursivelyApplySol(probKey, execPlan, child_node.child_option);
+            if(!childScheme)
+                return nullptr;
+
+            curScheme->numKernels += childScheme->numKernels;
+            curScheme->children.emplace_back(std::move(childScheme));
+        }
+    }
+    // SOL_LEAF_NODE
+    else if(sol_node.sol_node_type == SOL_LEAF_NODE)
+    {
+        // a leaf node should have exactly one child sol-node (SOL_KERNEL_ONLY or SOL_BUILTIN_KERNEL)
+        if(sol_node.solution_childnodes.size() != 1)
+            return nullptr;
+
+        std::string& kernel_token    = sol_node.solution_childnodes[0].child_token;
+        size_t       kernel_option   = sol_node.solution_childnodes[0].child_option;
+        bool         built_in_kernel = (kernel_token == solution_map::KERNEL_TOKEN_BUILTIN_KERNEL);
+
+        // When tuning, we're running through each bench,
+        // so we use the elaborated token (_leafnode_<id>_phase_<id>)
+        if(TuningBenchmarker::GetSingleton().IsProcessingTuning() && !built_in_kernel)
+        {
+            auto tuningPacket          = TuningBenchmarker::GetSingleton().GetPacket();
+            int  curr_tuning_node_id   = tuningPacket->tuning_node_id;
+            int  curr_tuning_phase     = tuningPacket->tuning_phase;
+            int  curr_tuning_config_id = tuningPacket->current_ssn;
+
+            // replacing the tuning target kernel_token to the candidate version
+            size_t cur_leaf_node_id = execPlan.solution_kernels.size();
+            kernel_token += "_leafnode_" + std::to_string(cur_leaf_node_id);
+
+            if(cur_leaf_node_id == (size_t)curr_tuning_node_id)
+            {
+                // if this kernel is the one we're tuning, then we set the testing phase and config_id
+                kernel_token += "_phase_" + std::to_string(curr_tuning_phase);
+                kernel_option = curr_tuning_config_id;
+            }
+            else
+            {
+                // if the kernel is not the tuning target: we should fix the kernel to the current winner
+                int curWinnerPhase = tuningPacket->winner_phases[cur_leaf_node_id];
+                int curWinnerID    = tuningPacket->winner_ids[cur_leaf_node_id];
+
+                kernel_token += "_phase_" + std::to_string(curWinnerPhase);
+                kernel_option = curWinnerID;
+            }
+        }
+
+        ProblemKey probKey_kernel(arch, kernel_token);
+        if(!sol_map_single.has_solution_node(probKey_kernel, kernel_option))
+            return nullptr;
+
+        // get the kernel of this leaf node, be sure to pick the right kernel option
+        SolutionNode kernel_node = sol_map_single.get_solution_node(probKey_kernel, kernel_option);
+        execPlan.solution_kernels.push_back(kernel_node.kernel_key);
+        curScheme->numKernels = 1;
+
+        if(LOG_TRACE_ENABLED())
+        {
+            (*LogSingleton::GetInstance().GetTraceOS())
+                << "found the kernel solution(" << arch << ", " << kernel_token
+                << ") with option: " << kernel_option << std::endl;
+        }
+    }
+    // we shouldn't handle any SOL_KERNEL_ONLY directly
+    else
+    {
+        throw std::runtime_error("Tree-Decomposition in solution map is invalid");
+        return nullptr; // unreachable: the throw above always leaves the function
+    }
+
+    // if here, means we've found valid solutions of all sub-probs
+    if(LOG_TRACE_ENABLED())
+    {
+        (*LogSingleton::GetInstance().GetTraceOS())
+            << "found solution for problemKey(" << problemKey.arch << ", " << problemKey.probToken
+            << ") with option: " << sol_option << std::endl;
+    }
+    if(LOG_TUNING_ENABLED())
+    {
+        (*LogSingleton::GetInstance().GetTuningOS())
+            << "[SolToken]: " << problemKey.probToken << std::endl;
+    }
+
+    return curScheme;
+}
+
+// Return the scheme tree of the first candidate problem key that resolves, or nullptr
+std::unique_ptr<SchemeTree> ApplySolution(ExecPlan& execPlan)
+{
+    std::vector<ProblemKey> candidateKeys;
+    GenerateProbKeys(*(execPlan.rootPlan), candidateKeys);
+
+    for(const auto& candidate : candidateKeys)
+    {
+        auto schemeTree = RecursivelyApplySol(candidate, execPlan, 0);
+        if(schemeTree)
+            return schemeTree;
+
+        // discard kernel keys collected by the failed attempt before trying the next key
+        execPlan.solution_kernels = EmptyFMKeyVec;
+    }
+
+    return nullptr;
+}
+
 void ProcessNode(ExecPlan& execPlan)
 {
-    execPlan.rootPlan->RecursiveBuildTree();
+    SchemeTree* rootScheme = (execPlan.rootScheme) ? execPlan.rootScheme.get() : nullptr;
+    bool        noSolution = (rootScheme == nullptr);
+
+    execPlan.rootPlan->RecursiveBuildTree(rootScheme);
 
     assert(execPlan.rootPlan->length.size() == execPlan.rootPlan->dimension);
     assert(execPlan.rootPlan->length.size() == execPlan.rootPlan->inStride.size());
@@ -1446,8 +1908,12 @@
 
     // collect leaf-nodes to execSeq and fuseShims
     execPlan.rootPlan->CollectLeaves(execPlan.execSeq, execPlan.fuseShims);
-    CheckFuseShimForArch(execPlan);
-    OrderFuseShims(execPlan.execSeq, execPlan.fuseShims);
+
+    if(noSolution)
+    {
+        CheckFuseShimForArch(execPlan);
+        OrderFuseShims(execPlan.execSeq, execPlan.fuseShims);
+    }
 
     // initialize root plan input/output location if not already done
     if(execPlan.rootPlan->obOut == OB_UNINIT)
@@ -1465,11 +1931,14 @@
     AssignmentPolicy policy;
     policy.AssignBuffers(execPlan);
 
-    // Apply the fusion after buffer, strides are assigned
-    execPlan.rootPlan->ApplyFusion();
+    if(noSolution)
+    {
+        // Apply the fusion after buffer, strides are assigned
+        execPlan.rootPlan->ApplyFusion();
 
-    // collect the execSeq since we've fused some kernels
-    execPlan.rootPlan->CollectLeaves(execPlan.execSeq, execPlan.fuseShims);
+        // collect the execSeq since we've fused some kernels
+        execPlan.rootPlan->CollectLeaves(execPlan.execSeq, execPlan.fuseShims);
+    }
 
     // So we also need to update the whole tree including internal nodes
     // NB: The order matters: assign param -> fusion -> refresh internal node param
@@ -1482,7 +1951,28 @@
     execPlan.rootPlan->CollapseContiguousDims();
 
     // Check the buffer, param and tree integrity, Note we do this after fusion
-    execPlan.rootPlan->SanityCheck();
+    try
+    {
+        // rootScheme may be nullptr and solution_kernels may be empty (when there is no
+        // solution); when there is a solution, this also checks that it is valid
+        execPlan.rootPlan->SanityCheck(rootScheme, execPlan.solution_kernels);
+    }
+    catch(const std::exception& e)
+    {
+        // When SanityCheck fails and either solution_kernels is empty or
+        // rootScheme is nullptr, the failure has nothing to do with the
+        // solution map, so rethrow to terminate
+        if(execPlan.solution_kernels.empty() || rootScheme == nullptr)
+            throw;
+        else
+        {
+            // data from solution map are invalid, then we're not able to use them
+            if(LOG_TRACE_ENABLED())
+                (*LogSingleton::GetInstance().GetTraceOS())
+                    << "input solution are invalid, try replacing kernels" << std::endl;
+            execPlan.rootPlan->SanityCheck();
+        }
+    }
 
     // get workBufSize..
     size_t tmpBufSize       = 0;
diff -Nru rocfft-5.5.0/library/src/powX.cpp rocfft-5.7.1/library/src/powX.cpp
--- rocfft-5.5.0/library/src/powX.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/powX.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +33,7 @@
 #include "plan.h"
 #include "rtc_kernel.h"
 #include "transform.h"
+#include "tuning_helper.h"
 
 #include "kernel_launch.h"
 
@@ -42,9 +43,11 @@
 #include "real2complex.h"
 
 #include "../../shared/environment.h"
+#include "../../shared/precision_type.h"
 #include "../../shared/printbuffer.h"
 #include "../../shared/ptrdiff.h"
-#include "rocfft_hip.h"
+#include "../../shared/rocfft_complex.h"
+#include "../../shared/rocfft_hip.h"
 
 // This function is called during creation of plan: enqueue the HIP kernels by function
 // pointers. Return true if everything goes well. Any internal device memory allocation
@@ -53,7 +56,7 @@
 {
     for(const auto& node : execPlan.execSeq)
     {
-        if(node->CreateTwiddleTableResource() == false)
+        if(node->CreateDeviceResources() == false)
             return false;
 
         if(node->CreateDevKernelArgs() == false)
@@ -88,15 +91,82 @@
     return true;
 }
 
+bool GetTuningKernelInfo(ExecPlan& execPlan)
+{
+    auto tuningPacket = TuningBenchmarker::GetSingleton().GetPacket();
+    if(!tuningPacket)
+        return false; // no tuning packet available, nothing to record
+    // record launch/config info of every kernel in execSeq into slot i of the packet
+    for(size_t i = 0; i < execPlan.execSeq.size(); ++i)
+    {
+        RTCKernel*   localCompiledKernel = execPlan.execSeq[i]->compiledKernel.get().get();
+        GridParam    gp                  = execPlan.gridParam[i];
+        FMKey        key                 = execPlan.execSeq[i]->GetKernelKey();
+        auto         lengths             = std::get<0>(key);
+        KernelConfig config              = std::get<4>(key);
+
+        // get occupancy: 0 means no RTCKernel is attached, i.e. it's compiled (AOT)
+        //               -1 means the occupancy query failed or returned 0
+        // TODO- get occupancy of non-RTCKernel
+        int occupancy = 0;
+        if(localCompiledKernel)
+        {
+            // a queried occupancy of 0 very likely means this kernel can't be
+            // loaded, so it is reported the same way as a failed query
+            if(!localCompiledKernel->get_occupancy(
+                   {gp.wgs_x, gp.wgs_y, gp.wgs_z}, gp.lds_bytes, occupancy)
+               || occupancy == 0)
+                occupancy = -1;
+        }
+
+        // factors as a "[f0, f1, ...]" string; it is written to the CSV file
+        // and carried from phase-0 to phase-1
+        std::string factors_str = "[";
+        std::string COMMA       = "";
+        for(auto factor : config.factors)
+        {
+            factors_str += COMMA + std::to_string(factor);
+            COMMA = ", ";
+        }
+        factors_str += "]";
+
+        // utilization info as a "[h0, h1, ..., mean]" string (4 fixed decimals) for the CSV file
+        std::stringstream util_ss;
+        util_ss << "[";
+        util_ss.precision(4);
+        float util_rate = 0.0f;
+        for(auto width : config.factors)
+        {
+            float height = static_cast<float>(lengths[0]) / width / config.threads_per_transform[0];
+            util_ss << std::fixed << height << ", ";
+
+            util_rate += height;
+        }
+        util_rate /= config.factors.size(); // NOTE(review): NaN if factors is empty - confirm
+        util_ss << std::fixed << util_rate << "]";
+
+        tuningPacket->num_of_blocks[i] = gp.b_x;
+        tuningPacket->lds_bytes[i]     = gp.lds_bytes;
+        tuningPacket->occupancy[i]     = occupancy;
+        tuningPacket->wgs[i]           = config.workgroup_size;
+        tuningPacket->tpt[i]           = config.threads_per_transform[0];
+        tuningPacket->tpb[i]           = config.transforms_per_block;
+        tuningPacket->util_rate[i]     = util_ss.str();
+        tuningPacket->factors_str[i]   = factors_str;
+    }
+
+    return true;
+}
+
 static size_t data_size_bytes(const std::vector<size_t>& lengths,
                               rocfft_precision           precision,
                               rocfft_array_type          type)
 {
     // first compute the raw number of elements
-    size_t elems = std::accumulate(
+    const size_t elems = std::accumulate(
         lengths.begin(), lengths.end(), static_cast<size_t>(1), std::multiplies<size_t>());
     // size of each element
-    size_t elemsize = (precision == rocfft_precision_single ? sizeof(float) : sizeof(double));
+    const size_t elemsize = real_type_size(precision);
     switch(type)
     {
     case rocfft_array_type_complex_interleaved:
@@ -173,7 +243,7 @@
 {
     const size_t size_elems = compute_ptrdiff(length_cm, stride_cm, batch, dist);
 
-    size_t base_type_size = (precision == rocfft_precision_double) ? sizeof(double) : sizeof(float);
+    size_t base_type_size = real_type_size(precision);
     if(type != rocfft_array_type_real)
     {
         // complex elements
@@ -186,14 +256,14 @@
     auto stride_rm = stride_cm;
     std::reverse(length_rm.begin(), length_rm.end());
     std::reverse(stride_rm.begin(), stride_rm.end());
-    std::vector<std::vector<char>> bufvec;
-    std::vector<size_t>            print_offset(2, offset);
+    std::vector<hostbuf> bufvec;
+    std::vector<size_t>  print_offset(2, offset);
     if(array_type_is_planar(type))
     {
         // separate the real/imag data, so printbuffer will print them separately
         bufvec.resize(2);
-        bufvec.front().resize(size_bytes / 2);
-        bufvec.back().resize(size_bytes / 2);
+        bufvec.front().alloc(size_bytes / 2);
+        bufvec.back().alloc(size_bytes / 2);
         if(hipMemcpy(bufvec.front().data(), buffer[0], size_bytes / 2, hipMemcpyDeviceToHost)
            != hipSuccess)
             throw std::runtime_error("hipMemcpy failure");
@@ -203,6 +273,12 @@
 
         switch(precision)
         {
+        case rocfft_precision_half:
+        {
+            buffer_printer<_Float16> s;
+            s.print_buffer(bufvec, length_rm, stride_rm, batch, dist, print_offset, stream);
+            break;
+        }
         case rocfft_precision_single:
         {
             buffer_printer<float> s;
@@ -220,13 +296,35 @@
     else
     {
         bufvec.resize(1);
-        bufvec.front().resize(size_bytes);
+        bufvec.front().alloc(size_bytes);
         if(hipMemcpy(bufvec.front().data(), buffer[0], size_bytes, hipMemcpyDeviceToHost)
            != hipSuccess)
             throw std::runtime_error("hipMemcpy failure");
 
         switch(precision)
         {
+        case rocfft_precision_half:
+        {
+            switch(type)
+            {
+            case rocfft_array_type_complex_interleaved:
+            case rocfft_array_type_hermitian_interleaved:
+            {
+                buffer_printer<rocfft_complex<_Float16>> s;
+                s.print_buffer(bufvec, length_rm, stride_rm, batch, dist, print_offset, stream);
+                break;
+            }
+            case rocfft_array_type_real:
+            {
+                buffer_printer<_Float16> s;
+                s.print_buffer(bufvec, length_rm, stride_rm, batch, dist, print_offset, stream);
+                break;
+            }
+            default:
+                throw std::runtime_error("invalid array format");
+            }
+            break;
+        }
         case rocfft_precision_single:
         {
             switch(type)
@@ -234,7 +332,7 @@
             case rocfft_array_type_complex_interleaved:
             case rocfft_array_type_hermitian_interleaved:
             {
-                buffer_printer<std::complex<float>> s;
+                buffer_printer<rocfft_complex<float>> s;
                 s.print_buffer(bufvec, length_rm, stride_rm, batch, dist, print_offset, stream);
                 break;
             }
@@ -256,7 +354,7 @@
             case rocfft_array_type_complex_interleaved:
             case rocfft_array_type_hermitian_interleaved:
             {
-                buffer_printer<std::complex<double>> s;
+                buffer_printer<rocfft_complex<double>> s;
                 s.print_buffer(bufvec, length_rm, stride_rm, batch, dist, print_offset, stream);
                 break;
             }
@@ -295,27 +393,69 @@
 
     if(is_complex && type == SetCallbackType::LOAD)
     {
-        result = (node->precision == rocfft_precision_single)
-                     ? hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_float2), sizeof(void*))
-                     : hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_double2), sizeof(void*));
+        switch(node->precision)
+        {
+        case rocfft_precision_half:
+            result
+                = hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_complex_half), sizeof(void*));
+            break;
+        case rocfft_precision_single:
+            result
+                = hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_complex_float), sizeof(void*));
+            break;
+        case rocfft_precision_double:
+            result = hipMemcpyFromSymbol(
+                cb, HIP_SYMBOL(load_cb_default_complex_double), sizeof(void*));
+            break;
+        }
     }
     else if(is_complex && type == SetCallbackType::STORE)
     {
-        result = (node->precision == rocfft_precision_single)
-                     ? hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_float2), sizeof(void*))
-                     : hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_double2), sizeof(void*));
+        switch(node->precision)
+        {
+        case rocfft_precision_half:
+            result
+                = hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_complex_half), sizeof(void*));
+            break;
+        case rocfft_precision_single:
+            result = hipMemcpyFromSymbol(
+                cb, HIP_SYMBOL(store_cb_default_complex_float), sizeof(void*));
+            break;
+        case rocfft_precision_double:
+            result = hipMemcpyFromSymbol(
+                cb, HIP_SYMBOL(store_cb_default_complex_double), sizeof(void*));
+            break;
+        }
     }
     else if(!is_complex && type == SetCallbackType::LOAD)
     {
-        result = (node->precision == rocfft_precision_single)
-                     ? hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_float), sizeof(void*))
-                     : hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_double), sizeof(void*));
+        switch(node->precision)
+        {
+        case rocfft_precision_half:
+            result = hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_half), sizeof(void*));
+            break;
+        case rocfft_precision_single:
+            result = hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_float), sizeof(void*));
+            break;
+        case rocfft_precision_double:
+            result = hipMemcpyFromSymbol(cb, HIP_SYMBOL(load_cb_default_double), sizeof(void*));
+            break;
+        }
     }
     else if(!is_complex && type == SetCallbackType::STORE)
     {
-        result = (node->precision == rocfft_precision_single)
-                     ? hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_float), sizeof(void*))
-                     : hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_double), sizeof(void*));
+        switch(node->precision)
+        {
+        case rocfft_precision_half:
+            result = hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_half), sizeof(void*));
+            break;
+        case rocfft_precision_single:
+            result = hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_float), sizeof(void*));
+            break;
+        case rocfft_precision_double:
+            result = hipMemcpyFromSymbol(cb, HIP_SYMBOL(store_cb_default_double), sizeof(void*));
+            break;
+        }
     }
 
     if(result != hipSuccess)
@@ -332,12 +472,15 @@
     assert(execPlan.execSeq.size() == execPlan.devFnCall.size());
     assert(execPlan.execSeq.size() == execPlan.gridParam.size());
 
+    bool processing_tuning = TuningBenchmarker::GetSingleton().IsProcessingTuning();
+    auto tuningPacket      = TuningBenchmarker::GetSingleton().GetPacket();
     // we can log profile information if we're on the null stream,
     // since we will be able to wait for the transform to finish
-    bool            emit_profile_log  = LOG_PROFILE_ENABLED() && !info->rocfft_stream;
-    bool            emit_kernelio_log = LOG_KERNELIO_ENABLED();
-    rocfft_ostream* kernelio_stream   = nullptr;
-    float           max_memory_bw     = 0.0;
+    bool emit_profile_log  = (processing_tuning || LOG_PROFILE_ENABLED()) && !info->rocfft_stream;
+    bool emit_kernelio_log = LOG_KERNELIO_ENABLED();
+
+    rocfft_ostream* kernelio_stream = nullptr;
+    float           max_memory_bw   = 0.0;
     hipEvent_t      start, stop;
     if(emit_profile_log)
     {
@@ -372,9 +515,7 @@
             data.log_func = nullptr;
 
         // Size of complex type
-        const size_t complexTSize = (data.node->precision == rocfft_precision_single)
-                                        ? sizeof(float) * 2
-                                        : sizeof(double) * 2;
+        const size_t complexTSize = complex_type_size(data.node->precision);
 
         switch(data.node->obIn)
         {
@@ -618,6 +759,9 @@
                 auto efficiency_pct = 0.0;
                 if(max_memory_bw != 0.0)
                     efficiency_pct = 100.0 * exec_bw / max_memory_bw;
+                if(processing_tuning)
+                    tuningPacket->bw_effs[i] = efficiency_pct;
+
                 log_profile(__func__,
                             "scheme",
                             PrintScheme(execPlan.execSeq[i]->scheme),
diff -Nru rocfft-5.5.0/library/src/repo.cpp rocfft-5.7.1/library/src/repo.cpp
--- rocfft-5.5.0/library/src/repo.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/repo.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -25,13 +25,13 @@
 #include <numeric>
 #include <vector>
 
+#include "chirp.h"
 #include "logging.h"
 #include "node_factory.h"
 #include "plan.h"
 #include "repo.h"
 #include "rocfft.h"
 #include "twiddles.h"
-
 // Implementation of Class Repo
 
 std::mutex        Repo::mtx;
@@ -74,6 +74,42 @@
 }
 
 template <typename KeyType>
+std::pair<void*, size_t>
+    Repo::GetChirpInternal(KeyType                                             key,
+                           std::map<KeyType, std::pair<gpubuf, unsigned int>>& chirp,
+                           std::map<void*, KeyType>&                           chirp_reverse,
+                           std::function<gpubuf(unsigned int)>                 create_chirp)
+{
+    if(repoDestroyed)
+    {
+        throw std::runtime_error("Repo prematurely destroyed.");
+    }
+
+    // store the current device in the key, so the lookup below is per device
+    if(hipGetDevice(&key.deviceId) != hipSuccess)
+    {
+        throw std::runtime_error("hipGetDevice failed.");
+    }
+
+    auto it = chirp.find(key);
+    if(it != chirp.end())
+    {
+        // already have a buffer for this key: bump its use count and share it
+        it->second.second += 1;
+        return {it->second.first.data(), it->second.first.size()};
+    }
+
+    // otherwise, need to allocate; a new entry starts with a use count of 1
+    auto buf = create_chirp(key.deviceId);
+    // if allocation failed, don't update maps
+    if(buf.data() == nullptr)
+        return {nullptr, 0};
+    it = chirp.insert({key, std::make_pair(std::move(buf), 1)}).first;
+    chirp_reverse.insert({it->second.first.data(), key});
+    return {it->second.first.data(), it->second.first.size()};
+}
+
+template <typename KeyType>
 void Repo::ReleaseTwiddlesInternal(void*                                               ptr,
                                    std::map<KeyType, std::pair<gpubuf, unsigned int>>& twiddles,
                                    std::map<void*, KeyType>& twiddles_reverse)
@@ -102,9 +138,39 @@
     }
 }
 
+template <typename KeyType>
+void Repo::ReleaseChirpInternal(void*                                               ptr,
+                                std::map<KeyType, std::pair<gpubuf, unsigned int>>& chirp,
+                                std::map<void*, KeyType>&                           chirp_reverse)
+{
+    if(repoDestroyed)
+    {
+        throw std::runtime_error("Repo prematurely destroyed.");
+    }
+    // map the pointer back to its key; a pointer not in the reverse map is ignored
+    auto reverse_it = chirp_reverse.find(ptr);
+    if(reverse_it == chirp_reverse.end())
+        return;
+    auto forward_it = chirp.find(reverse_it->second);
+    if(forward_it == chirp.end())
+    {
+        // orphaned reverse entry (its key is not in the forward map): just drop it
+        chirp_reverse.erase(reverse_it);
+        return;
+    }
+    forward_it->second.second -= 1;
+    if(forward_it->second.second == 0)
+    {
+        // use count reached zero: remove from both maps
+        chirp.erase(forward_it);
+        chirp_reverse.erase(reverse_it);
+    }
+}
+
 std::pair<void*, size_t> Repo::GetTwiddles1D(size_t                     length,
                                              size_t                     length_limit,
                                              rocfft_precision           precision,
+                                             const char*                gpu_arch,
                                              size_t                     largeTwdBase,
                                              bool                       attach_halfN,
                                              const std::vector<size_t>& radices)
@@ -112,27 +178,47 @@
     std::lock_guard<std::mutex> lck(mtx);
     Repo&                       repo = Repo::GetRepo();
 
-    repo_key_1D_t key{length, length_limit, precision, largeTwdBase, attach_halfN, radices};
+    repo_twd_key_1D_t key{length, length_limit, precision, largeTwdBase, attach_halfN, radices};
     return GetTwiddlesInternal(
         key, repo.twiddles_1D, repo.twiddles_1D_reverse, [&](unsigned int deviceId) {
-            return twiddles_create(
-                length, length_limit, precision, largeTwdBase, attach_halfN, radices, deviceId);
+            return twiddles_create(length,
+                                   length_limit,
+                                   precision,
+                                   gpu_arch,
+                                   largeTwdBase,
+                                   attach_halfN,
+                                   radices,
+                                   deviceId);
         });
 }
 
-std::pair<void*, size_t>
-    Repo::GetTwiddles2D(size_t length0, size_t length1, rocfft_precision precision)
+std::pair<void*, size_t> Repo::GetTwiddles2D(size_t           length0,
+                                             size_t           length1,
+                                             rocfft_precision precision,
+                                             const char*      gpu_arch)
 {
     std::lock_guard<std::mutex> lck(mtx);
     Repo&                       repo = Repo::GetRepo();
 
-    repo_key_2D_t key{length0, length1, precision};
+    repo_twd_key_2D_t key{length0, length1, precision};
     return GetTwiddlesInternal(
         key, repo.twiddles_2D, repo.twiddles_2D_reverse, [&](unsigned int deviceId) {
-            return twiddles_create_2D(length0, length1, precision, deviceId);
+            return twiddles_create_2D(length0, length1, precision, gpu_arch, deviceId);
         });
 }
 
+std::pair<void*, size_t>
+    Repo::GetChirp(size_t length, rocfft_precision precision, const char* gpu_arch)
+{
+    std::lock_guard<std::mutex> lck(mtx);
+    Repo&                       repo = Repo::GetRepo();
+    // key is built from (length, precision) here; gpu_arch is only forwarded to chirp_create
+    repo_chirp_key_t key{length, precision};
+    return GetChirpInternal(key, repo.chirp, repo.chirp_reverse, [&](unsigned int deviceId) {
+        return chirp_create(length, precision, gpu_arch, deviceId);
+    });
+}
+
 void Repo::ReleaseTwiddle1D(void* ptr)
 {
     std::lock_guard<std::mutex> lck(mtx);
@@ -149,13 +235,33 @@
     return ReleaseTwiddlesInternal(ptr, repo.twiddles_2D, repo.twiddles_2D_reverse);
 }
 
+// Release one use of the chirp buffer at ptr, under the repo mutex.
+void Repo::ReleaseChirp(void* ptr)
+{
+    std::lock_guard<std::mutex> lck(mtx);
+
+    Repo& repo = Repo::GetRepo();
+    return ReleaseChirpInternal(ptr, repo.chirp, repo.chirp_reverse);
+}
+
+// Empty all twiddle and chirp maps and run the twiddle/chirp stream cleanup.
+// Does nothing once the repo has been destroyed.
 void Repo::Clear()
 {
     std::lock_guard<std::mutex> lck(mtx);
     if(repoDestroyed)
         return;
     Repo& repo = Repo::GetRepo();
+
+    // the reverse (pointer -> key) maps are emptied together with the forward maps: a stale
+    // entry would survive a later std::map::insert for a buffer reusing the same address
     repo.twiddles_1D.clear();
+    repo.twiddles_1D_reverse.clear();
     repo.twiddles_2D.clear();
+    repo.twiddles_2D_reverse.clear();
     twiddle_streams_cleanup();
+
+    repo.chirp.clear();
+    repo.chirp_reverse.clear();
+    chirp_streams_cleanup();
 }
diff -Nru rocfft-5.5.0/library/src/rocfft_aot_helper.cpp rocfft-5.7.1/library/src/rocfft_aot_helper.cpp
--- rocfft-5.5.0/library/src/rocfft_aot_helper.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rocfft_aot_helper.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -31,6 +31,7 @@
 #include "rtc_cache.h"
 #include "rtc_realcomplex_gen.h"
 #include "rtc_stockham_gen.h"
+#include "rtc_twiddle_gen.h"
 
 #include "device/kernel-generator-embed.h"
 
@@ -133,18 +134,15 @@
         unitstride_range = {false};
 
         base_steps.resize(1);
-        // All SBRCs have TILE_UNALIGNED
-        sbrc_trans_types.push_back(SBRC_TRANSPOSE_TYPE::TILE_UNALIGNED);
-        // Finish SBRC-2D
-        if(scheme == CS_KERNEL_STOCKHAM_BLOCK_RC)
-            break;
-        // All 3D SBRCs have TILE_ALIGNED, but "NO" SBRC_TRANSPOSE_TYPE::NONE
-        sbrc_trans_types.push_back(SBRC_TRANSPOSE_TYPE::TILE_ALIGNED);
+        // All SBRCs have ALIGNED and UNALIGNED, but no NONE
         sbrc_trans_types.erase(sbrc_trans_types.begin());
-        // Finish ERC
-        if(scheme == CS_KERNEL_STOCKHAM_R_TO_CMPLX_TRANSPOSE_Z_XY)
+        sbrc_trans_types.push_back(SBRC_TRANSPOSE_TYPE::TILE_ALIGNED);
+        sbrc_trans_types.push_back(SBRC_TRANSPOSE_TYPE::TILE_UNALIGNED);
+        // Finish SBRC-2D and SBRC-3D-ERC without DIAGONAL
+        if(scheme == CS_KERNEL_STOCKHAM_BLOCK_RC
+           || scheme == CS_KERNEL_STOCKHAM_R_TO_CMPLX_TRANSPOSE_Z_XY)
             break;
-        // DIAGNAL Transpose
+        // DIAGONAL Transpose
         sbrc_trans_types.push_back(SBRC_TRANSPOSE_TYPE::DIAGONAL);
 
         break;
@@ -227,6 +225,9 @@
     // scaling Stockham kernels are always built at runtime
     const bool enable_scaling = false;
 
+    // fused Bluestein kernels are also always built at runtime
+    auto fuseBlue = BluesteinFuseType::BFT_NONE;
+
     for(const auto& i : fp.get_map())
     {
         // we only want to compile kernels explicitly marked for AOT RTC
@@ -240,6 +241,15 @@
         std::vector<unsigned int> factors;
         std::copy(i.second.factors.begin(), i.second.factors.end(), std::back_inserter(factors));
 
+        StockhamGeneratorSpecs specs{factors,
+                                     {},
+                                     {static_cast<unsigned int>(precision)},
+                                     static_cast<unsigned int>(i.second.workgroup_size),
+                                     PrintScheme(scheme)};
+        specs.threads_per_transform = i.second.threads_per_transform[0];
+        specs.half_lds              = i.second.half_lds;
+        specs.direct_to_from_reg    = i.second.direct_to_from_reg;
+
         stockham_combo(scheme,
                        i.second,
                        [=, &queue](int                     direction,
@@ -274,10 +284,9 @@
                                    return;
                            }
 
-                           auto kernel_name = stockham_rtc_kernel_name(scheme,
-                                                                       length1D,
-                                                                       0,
-                                                                       0,
+                           auto kernel_name = stockham_rtc_kernel_name(specs,
+                                                                       specs,
+                                                                       scheme,
                                                                        direction,
                                                                        precision,
                                                                        placement,
@@ -292,7 +301,8 @@
                                                                        intrinsic,
                                                                        sbrc_trans_type,
                                                                        callbacks,
-                                                                       enable_scaling);
+                                                                       enable_scaling,
+                                                                       fuseBlue);
                            std::function<std::string(const std::string&)> generate_src
                                = [=](const std::string& kernel_name) -> std::string {
                                StockhamGeneratorSpecs specs{
@@ -323,7 +333,8 @@
                                                    intrinsic,
                                                    sbrc_trans_type,
                                                    callbacks,
-                                                   enable_scaling);
+                                                   enable_scaling,
+                                                   fuseBlue);
                            };
                            queue.push({kernel_name, generate_src});
                        });
@@ -415,6 +426,28 @@
     }
 }
 
+void build_twiddle(CompileQueue& queue) // queue one twiddle kernel per (type, precision)
+{
+    const auto twiddle_kernel_types = { // twiddle table types to queue a kernel for
+        TwiddleTableType::RADICES,
+        TwiddleTableType::LENGTH_N,
+        TwiddleTableType::HALF_N,
+        TwiddleTableType::LARGE,
+    };
+    for(auto precision : {rocfft_precision_single, rocfft_precision_double}) // half not queued
+    {
+        for(auto type : twiddle_kernel_types)
+        {
+            auto kernel_name = twiddle_rtc_kernel_name(type, precision);
+            std::function<std::string(const std::string&)> generate_src
+                = [=](const std::string& kernel_name) -> std::string { // shadows outer name
+                return twiddle_rtc(kernel_name, type, precision);
+            };
+            queue.push({kernel_name, generate_src});
+        }
+    }
+}
+
 int main(int argc, char** argv)
 {
     if(argc < 5)
@@ -474,6 +507,7 @@
     build_stockham_function_pool(queue);
     build_realcomplex(queue);
     build_apply_callback(queue);
+    build_twiddle(queue);
 
     // signal end of results with empty work items
     for(size_t i = 0; i < NUM_THREADS; ++i)
diff -Nru rocfft-5.5.0/library/src/rocfft_config_search.cpp rocfft-5.7.1/library/src/rocfft_config_search.cpp
--- rocfft-5.5.0/library/src/rocfft_config_search.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rocfft_config_search.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -169,22 +169,23 @@
                         IntrinsicAccessType::DISABLE_BOTH,
                         SBRC_TRANSPOSE_TYPE::NONE,
                         false,
-                        false);
+                        false,
+                        BluesteinFuseType::BFT_NONE);
 }
 
 // things that we need to remember between kernel launches
 struct device_data_t
 {
-    std::vector<float2> host_input_buf;
-    gpubuf_t<float2>    fake_twiddles;
-    gpubuf_t<float2>    input_buf;
-    gpubuf_t<float2>    output_buf;
-    gpubuf_t<size_t>    lengths;
-    gpubuf_t<size_t>    stride_in;
-    gpubuf_t<size_t>    stride_out;
-    size_t              batch;
-    hipEvent_t          start;
-    hipEvent_t          stop;
+    std::vector<rocfft_complex<float>> host_input_buf;
+    gpubuf_t<rocfft_complex<float>>    fake_twiddles;
+    gpubuf_t<rocfft_complex<float>>    input_buf;
+    gpubuf_t<rocfft_complex<float>>    output_buf;
+    gpubuf_t<size_t>                   lengths;
+    gpubuf_t<size_t>                   stride_in;
+    gpubuf_t<size_t>                   stride_out;
+    size_t                             batch;
+    hipEvent_t                         start;
+    hipEvent_t                         stop;
 
     device_data_t()
     {
@@ -227,7 +228,7 @@
         // before each execution
         if(hipMemcpy(data.input_buf.data(),
                      data.host_input_buf.data(),
-                     data.host_input_buf.size() * sizeof(float2),
+                     data.host_input_buf.size() * sizeof(rocfft_complex<float>),
                      hipMemcpyHostToDevice)
            != hipSuccess)
             throw std::runtime_error("failed to hipMemcpy");
@@ -251,7 +252,7 @@
 unsigned int get_lds_bytes(unsigned int length, unsigned int transforms_per_block, bool half_lds)
 {
     // assume single precision complex
-    return length * transforms_per_block * sizeof(float2) / (half_lds ? 2 : 1);
+    return length * transforms_per_block * sizeof(rocfft_complex<float>) / (half_lds ? 2 : 1);
 }
 
 size_t batch_size(unsigned int length)
@@ -262,10 +263,10 @@
     return target_elems / length;
 }
 
-std::vector<float2> create_input_buf(unsigned int length, size_t batch)
+std::vector<rocfft_complex<float>> create_input_buf(unsigned int length, size_t batch)
 {
-    auto                elems = length * batch;
-    std::vector<float2> buf;
+    auto                               elems = length * batch;
+    std::vector<rocfft_complex<float>> buf;
     buf.reserve(elems);
     std::mt19937 gen;
     for(unsigned int i = 0; i < elems; ++i)
@@ -277,13 +278,13 @@
     return buf;
 }
 
-gpubuf_t<float2> create_device_buf(unsigned int length, size_t batch)
+gpubuf_t<rocfft_complex<float>> create_device_buf(unsigned int length, size_t batch)
 {
-    auto             elems = length * batch;
-    gpubuf_t<float2> device_buf;
-    if(device_buf.alloc(elems * sizeof(float2)) != hipSuccess)
+    auto                            elems = length * batch;
+    gpubuf_t<rocfft_complex<float>> device_buf;
+    if(device_buf.alloc(elems * sizeof(rocfft_complex<float>)) != hipSuccess)
         throw std::runtime_error("failed to hipMalloc");
-    if(hipMemset(device_buf.data(), 0, elems * sizeof(float2)) != hipSuccess)
+    if(hipMemset(device_buf.data(), 0, elems * sizeof(rocfft_complex<float>)) != hipSuccess)
         throw std::runtime_error("failed to hipMemset");
 
     return device_buf;
@@ -343,7 +344,7 @@
     data.fake_twiddles = create_device_buf(length, 1);
     if(hipMemcpy(data.fake_twiddles.data(),
                  host_twiddles.data(),
-                 host_twiddles.size() * sizeof(float2),
+                 host_twiddles.size() * sizeof(rocfft_complex<float>),
                  hipMemcpyHostToDevice)
        != hipSuccess)
         throw std::runtime_error("failed to hipMemcpy");
diff -Nru rocfft-5.5.0/library/src/rocfft_offline_tuner.cpp rocfft-5.7.1/library/src/rocfft_offline_tuner.cpp
--- rocfft-5.5.0/library/src/rocfft_offline_tuner.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/rocfft_offline_tuner.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,553 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "../../shared/environment.h"
+#include "../../shared/gpubuf.h"
+#include "../../shared/rocfft_params.h"
+#include "option_util.h"
+#include "rocfft.h"
+#include "tuning_helper.h"
+
+// Report a failed HIP call: when res is not hipSuccess, a message built from the status,
+// fileName, lineno and msg is printed to stdout and thrown as std::runtime_error.
+inline void
+    hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName)
+{
+    // success needs no report
+    if(res == hipSuccess)
+        return;
+
+    std::stringstream report;
+    report << "HIP_V_THROWERROR< " << res << " > (";
+    report << fileName << " Line: " << lineno << "): ";
+    report << msg;
+
+    // print the message, then raise it
+    const std::string errorm = report.str();
+    std::cout << errorm << std::endl;
+    throw std::runtime_error(errorm);
+}
+
+// Report a failed library call: when res is not fft_status_success, a message built from the
+// status, fileName, lineno and msg is printed to stdout and thrown as std::runtime_error.
+inline void
+    lib_V_Throw(fft_status res, const std::string& msg, size_t lineno, const std::string& fileName)
+{
+    // success needs no report
+    if(res == fft_status_success)
+        return;
+
+    std::stringstream report;
+    report << "LIB_V_THROWERROR< " << res << " > (";
+    report << fileName << " Line: " << lineno << "): ";
+    report << msg;
+
+    // print the message, then raise it
+    const std::string errorm = report.str();
+    std::cout << errorm << std::endl;
+    throw std::runtime_error(errorm);
+}
+
+#define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__)
+#define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__)
+
+static const int command_tuning  = 0;
+static const int command_merging = 1;
+
+// Merge the solution identified by probKey from new_filename into the map in base_filename
+// and write the result to out_filename. Returns EXIT_SUCCESS or EXIT_FAILURE.
+int merge_solutions(const std::string& base_filename,
+                    const std::string& new_filename,
+                    const std::string& probKey,
+                    const std::string& out_filename)
+{
+    // don't use anything from solutions.cpp
+    rocfft_setenv("ROCFFT_USE_EMPTY_SOL_MAP", "1");
+    rocfft_setup();
+
+    // fetch the tuner object; the pointer stays null if the call does not fill it in
+    TuningBenchmarker* offline_tuner = nullptr;
+    rocfft_get_offline_tuner_handle((void**)(&offline_tuner));
+
+    // Manipulating the solution map from tuner; a missing tuner counts as a failed merge
+    const bool merge_result = offline_tuner != nullptr
+        && offline_tuner->MergingSolutionsMaps(base_filename, new_filename, probKey, out_filename);
+
+    rocfft_cleanup();
+
+    if(!merge_result)
+    {
+        std::cout << "Merge Solutions Failed" << std::endl;
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
+
+// Tune one FFT problem: benchmark every kernel candidate of every tuning node over
+// all tuning phases, export per-node results to CSV and the winners to the solution map.
+// params describes the problem, verbose != 0 echoes it, ntrial is the number of timed
+// runs per candidate (values below 1 are treated as 1). Returns EXIT_SUCCESS, also when
+// the problem is skipped for lack of device memory, or EXIT_FAILURE if nothing is tunable.
+int offline_tune_problems(rocfft_params& params, int verbose, int ntrial)
+{
+    // don't use anything from solutions.cpp
+    rocfft_setenv("ROCFFT_USE_EMPTY_SOL_MAP", "1");
+
+    rocfft_setup();
+
+    params.validate();
+
+    if(!params.valid(verbose))
+        throw std::runtime_error("Invalid parameters, add --verbose=1 for detail");
+    if(verbose)
+        std::cout << params.str(" ") << std::endl;
+
+    std::cout << "Token: " << params.token() << std::endl;
+
+    // create tuning parameters
+    TuningBenchmarker* offline_tuner = nullptr;
+    rocfft_get_offline_tuner_handle((void**)(&offline_tuner));
+    if(offline_tuner == nullptr)
+        throw std::runtime_error("Failed to get the offline tuner handle");
+
+    // first time call create_plan is actually generating a bunch of combination of configs
+    offline_tuner->SetInitStep(0);
+
+    // Check free and total available memory:
+    size_t free  = 0;
+    size_t total = 0;
+    HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
+    const auto raw_vram_footprint
+        = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
+    if(!vram_fits_problem(raw_vram_footprint, free))
+    {
+        std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
+                  << ") raw data too large for device.\n";
+        rocfft_cleanup();
+        return EXIT_SUCCESS;
+    }
+
+    const auto vram_footprint = params.vram_footprint();
+    if(!vram_fits_problem(vram_footprint, free))
+    {
+        std::cout << "SKIPPED: Problem size (" << vram_footprint
+                  << ") raw data too large for device.\n";
+        rocfft_cleanup();
+        return EXIT_SUCCESS;
+    }
+
+    LIB_V_THROW(params.create_plan(), "Plan creation failed");
+
+    // GPU input buffer:
+    auto                ibuffer_sizes = params.ibuffer_sizes();
+    std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
+    std::vector<void*>  pibuffer(ibuffer_sizes.size());
+    for(unsigned int i = 0; i < ibuffer.size(); ++i)
+    {
+        HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed");
+        pibuffer[i] = ibuffer[i].data();
+    }
+
+    // Input data:
+    params.compute_input(ibuffer);
+
+    // GPU output buffer:
+    std::vector<gpubuf>  obuffer_data;
+    std::vector<gpubuf>* obuffer = &obuffer_data;
+    if(params.placement == fft_placement_inplace)
+    {
+        obuffer = &ibuffer;
+    }
+    else
+    {
+        auto obuffer_sizes = params.obuffer_sizes();
+        obuffer_data.resize(obuffer_sizes.size());
+        for(unsigned int i = 0; i < obuffer_data.size(); ++i)
+            HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed");
+    }
+    std::vector<void*> pobuffer(obuffer->size());
+    for(unsigned int i = 0; i < obuffer->size(); ++i)
+        pobuffer[i] = obuffer->at(i).data();
+
+    // finish initialization, solution map now contains all the candidates
+    // start doing real benchmark with different configurations
+    int num_nodes = offline_tuner->UpdateNumOfTuningNodes();
+    if(num_nodes == 0)
+    {
+        std::cout << "[Result]: This fft problem hasn't been supported yet. (Prime number or "
+                     "real-transform)"
+                  << std::endl;
+        rocfft_cleanup();
+        return EXIT_FAILURE;
+    }
+
+    std::vector<int>         winner_phases(num_nodes, 0);
+    std::vector<int>         winner_ids(num_nodes, 0);
+    std::vector<std::string> kernels(num_nodes, "");
+
+    static const int TUNING_PHASE   = 2;
+    double           curr_best_msec = std::numeric_limits<double>::max();
+
+    // calculate this once only; accumulate in size_t so large problems don't overflow int
+    const double totsize = std::accumulate(
+        params.length.begin(), params.length.end(), size_t(1), std::multiplies<size_t>());
+    const double k
+        = (params.itype == fft_array_type_real || params.otype == fft_array_type_real) ? 2.5 : 5.0;
+    const double opscount = (double)params.nbatch * k * totsize * log(totsize) / log(2.0);
+
+    for(int curr_phase = 0; curr_phase < TUNING_PHASE; ++curr_phase)
+    {
+        if(curr_phase > 0)
+        {
+            // SET TARGET_FACTOR and current PHASE
+            offline_tuner->SetInitStep(curr_phase);
+
+            // make sure we can re-create the plan
+            params.free();
+
+            LIB_V_THROW(params.create_plan(), "Plan creation failed");
+        }
+
+        // keeping creating plan
+        for(int node_id = 0; node_id < num_nodes; ++node_id)
+        {
+            std::string winner_name;
+            int         winner_phase   = 0;
+            int         winner_id      = 0;
+            int         num_benchmarks = offline_tuner->GetNumOfKernelCandidates(node_id);
+
+            offline_tuner->SetCurrentTuningNodeId(node_id);
+            for(int ssn = 0; ssn < num_benchmarks; ++ssn)
+            {
+                offline_tuner->SetCurrentKernelCandidateId(ssn);
+                std::cout << "\nTuning for node " << node_id << "/" << (num_nodes - 1)
+                          << ", tuning phase :" << curr_phase << "/" << (TUNING_PHASE - 1)
+                          << ", config :" << ssn << "/" << (num_benchmarks - 1) << std::endl;
+
+                // make sure we can re-create the plan
+                params.free();
+
+                LIB_V_THROW(params.create_plan(), "Plan creation failed");
+
+                // skip low occupancy test...simple output gflops 0
+                BenchmarkInfo info = offline_tuner->GetCurrBenchmarkInfo();
+                if(info.occupancy == 1 || info.occupancy < 0)
+                {
+                    std::cout << "\nOccupancy 1 or -1, Skipped" << std::endl;
+                    offline_tuner->UpdateCurrBenchResult(0, 0);
+                    continue;
+                }
+
+                params.execute(pibuffer.data(), pobuffer.data());
+
+                // Run the transform several times and record the execution time:
+                // (ntrial < 1 is clamped to one run so the median below is always defined)
+                std::vector<double> gpu_time(ntrial > 0 ? ntrial : 1);
+
+                hipEvent_t start, stop;
+                HIP_V_THROW(hipEventCreate(&start), "hipEventCreate failed");
+                HIP_V_THROW(hipEventCreate(&stop), "hipEventCreate failed");
+                for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial)
+                {
+                    HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");
+
+                    params.execute(pibuffer.data(), pobuffer.data());
+
+                    HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
+                    HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");
+
+                    float time;
+                    HIP_V_THROW(hipEventElapsedTime(&time, start, stop),
+                                "hipEventElapsedTime failed");
+                    gpu_time[itrial] = time;
+                }
+
+                std::cout << "Execution gpu time:";
+                for(const auto& i : gpu_time)
+                    std::cout << " " << i;
+                std::cout << " ms" << std::endl;
+
+                std::cout << "Execution gflops:  ";
+                for(const auto& i : gpu_time)
+                    std::cout << " " << opscount / (1e6 * i);
+                std::cout << std::endl;
+                HIP_V_THROW(hipEventDestroy(start), "hipEventDestroy failed");
+                HIP_V_THROW(hipEventDestroy(stop), "hipEventDestroy failed");
+
+                // get median, if odd, get middle one, else get avg(middle twos)
+                std::sort(gpu_time.begin(), gpu_time.end());
+                double ms_median
+                    = (gpu_time.size() % 2 == 1)
+                          ? gpu_time[gpu_time.size() / 2]
+                          : (gpu_time[gpu_time.size() / 2] + gpu_time[gpu_time.size() / 2 - 1]) / 2;
+                double gflops_median = opscount / (1e6 * ms_median);
+
+                offline_tuner->UpdateCurrBenchResult(ms_median, gflops_median);
+            }
+
+            offline_tuner->FindWinnerForCurrNode(
+                curr_best_msec, winner_phase, winner_id, winner_name);
+            std::cout << "\n[UP_TO_PHASE_" << curr_phase << "_RESULT]:" << std::endl;
+            std::cout << "\n[BEST_KERNEL]: In Phase: " << winner_phase
+                      << ", Config ID: " << winner_id << std::endl;
+
+            // update the latest winner info
+            winner_phases[node_id] = winner_phase;
+            winner_ids[node_id]    = winner_id;
+            kernels[node_id]       = winner_name;
+
+            bool is_last_phase = (curr_phase == TUNING_PHASE - 1);
+            bool is_last_node  = (node_id == num_nodes - 1);
+
+            // output data of this turn to csv
+            if(!offline_tuner->ExportCSV(node_id > 0 || curr_phase > 0))
+                std::cout << "Write CSV Failed." << std::endl;
+
+            // pass the target factors to next phase with permutation
+            if(!is_last_phase)
+                offline_tuner->PropagateBestFactorsToNextPhase();
+
+            // in last phase and last node: finished tuning
+            // export to file (output the winner solutions to solution map)
+            if(is_last_phase && is_last_node)
+                offline_tuner->ExportWinnerToSolutions();
+        }
+    }
+
+    std::string out_path;
+    offline_tuner->GetOutputSolutionMapPath(out_path);
+
+    std::cout << "\n[OUTPUT_FILE]: " << out_path << std::endl;
+    std::cout << "\n[BEST_SOLUTION]: " << params.token() << std::endl;
+    for(int node_id = 0; node_id < num_nodes; ++node_id)
+    {
+        std::cout << "[Result]: Node " << node_id << ":" << std::endl;
+        std::cout << "[Result]:     in phase   : " << winner_phases[node_id] << std::endl;
+        std::cout << "[Result]:     best config: " << winner_ids[node_id] << std::endl;
+        std::cout << "[Result]:     kernel name: " << kernels[node_id] << std::endl;
+    }
+    double best_gflops = opscount / (1e6 * curr_best_msec);
+    std::cout << "[Result]: GPU Time: " << curr_best_msec << std::endl;
+    std::cout << "[Result]: GFLOPS: " << best_gflops << std::endl;
+
+    rocfft_cleanup();
+
+    return EXIT_SUCCESS;
+}
+
+// Entry point of the offline tuner: parses the command line, then either merges solution
+// maps (--command 1) or benchmarks and tunes the requested FFT problem (--command 0).
+int main(int argc, char* argv[])
+{
+    // This helps with mixing output of both wide and narrow characters to the screen
+    std::ios::sync_with_stdio(false);
+
+    rocfft_params params;
+    std::string   lengthArgStr;
+    std::string   precisionStr;
+    int           verbose;
+    int           deviceId;
+    int           ntrial;
+    int           command_type; // 0: tuning , 1: merging
+
+    int transform_type_int;
+    int itype_int;
+    int otype_int;
+
+    std::string base_sol_filename   = "";
+    std::string adding_sol_filename = "";
+    std::string adding_problemkey   = "";
+    std::string output_sol_filename = "";
+
+    // Declare the supported options.
+    // clang-format off
+    options_description opdesc("rocfft rider command line options");
+    opdesc.add_options()("help,h", "produces this help message")
+        ("version,v", "Print queryable version information from the rocfft library")
+        ("command", value<int>(&command_type)->default_value(0), "Action to do:\n0) tuning\n1) merging solution map\n(default: 0)")
+
+        ("base_sol_file", value<std::string>(&base_sol_filename), "filename of base-solution-map")
+        ("new_sol_file", value<std::string>(&adding_sol_filename), "filename of new-solution-map")
+        ("new_probkey", value<std::string>(&adding_problemkey), "problemkey (\"arch:token\") of the solution to be added, (looking up the new-solution-map)")
+        ("output_sol_file", value<std::string>(&output_sol_filename), "filename of merged-solution-map")
+
+        ("device", value<int>(&deviceId)->default_value(0), "Select a specific device id")
+        ("verbose", value<int>(&verbose)->default_value(0), "Control output verbosity")
+        ("ntrial,N", value<int>(&ntrial)->default_value(1), "Trial size for the problem")
+        ("notInPlace,o", "Not in-place FFT transform (default: in-place)")
+        ("precision", value<std::string>(&precisionStr), "Transform precision: single (default), double, half")
+        ("transformType,t", value<int>(&transform_type_int)
+         ->default_value((int)fft_transform_type_complex_forward),
+         "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
+         "forward\n3) real inverse")
+        ( "batchSize,b", value<size_t>(&params.nbatch)->default_value(1),
+          "If this value is greater than one, arrays will be used ")
+        ( "itype", value<int>(&itype_int)
+          ->default_value((int)fft_array_type_unset),
+          "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
+          "hermitian interleaved\n4) hermitian planar")
+        ( "otype", value<int>(&otype_int)
+          ->default_value((int)fft_array_type_unset),
+          "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
+          "hermitian interleaved\n4) hermitian planar")
+        ("length",  value<std::string>(&lengthArgStr), "Lengths.(Separate by comma)");
+    // clang-format on
+
+    variables_map vm;
+    store(parse_command_line(argc, argv, opdesc), vm);
+    notify(vm);
+
+    if(vm.count("help"))
+    {
+        std::cout << opdesc << std::endl;
+        return EXIT_SUCCESS;
+    }
+
+    if(vm.count("version"))
+    {
+        char v[256];
+        rocfft_get_version_string(v, 256);
+        std::cout << "version " << v << std::endl;
+        return EXIT_SUCCESS;
+    }
+
+    //
+    // MERGING COMMAND
+    //
+    if(command_type == command_merging)
+    {
+        if(!vm.count("new_sol_file"))
+        {
+            std::cout << "Please specify file-path of the new solution map" << std::endl;
+            return EXIT_FAILURE;
+        }
+        if(!vm.count("new_probkey"))
+        {
+            std::cout << "Please specify the problem-key to be added" << std::endl;
+            return EXIT_FAILURE;
+        }
+        if(!vm.count("output_sol_file"))
+        {
+            std::cout << "Please specify file-path of the output solution map" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        return merge_solutions(
+            base_sol_filename, adding_sol_filename, adding_problemkey, output_sol_filename);
+    }
+
+    if(command_type != command_tuning)
+    {
+        std::cout << "Unknown command:" << command_type << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    //
+    // TUNING COMMAND
+    //
+    if(vm.count("precision"))
+    {
+        if(precisionStr == "half")
+            params.precision = fft_precision_half;
+        else if(precisionStr == "single")
+            params.precision = fft_precision_single;
+        else if(precisionStr == "double")
+            params.precision = fft_precision_double;
+        else
+        {
+            std::cout << "Invalid precision: " << precisionStr << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    if(vm.count("transformType"))
+        params.transform_type = (fft_transform_type)transform_type_int;
+    if(vm.count("itype"))
+        params.itype = (fft_array_type)itype_int;
+    if(vm.count("otype"))
+        params.otype = (fft_array_type)otype_int;
+
+    if(vm.count("ntrial"))
+        std::cout << "Running profile with " << ntrial << " samples\n";
+
+    if(!vm.count("length"))
+    {
+        std::cout << "Please specify transform length!" << std::endl;
+        std::cout << opdesc << std::endl;
+        return EXIT_SUCCESS;
+    }
+    parse_arg_ints(lengthArgStr, params.length);
+    std::cout << "length:";
+    for(auto& i : params.length)
+        std::cout << " " << i;
+    std::cout << "\n";
+
+    params.placement = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace;
+    std::cout << (vm.count("notInPlace") ? "out-of-place\n" : "in-place\n");
+
+    if(vm.count("istride"))
+    {
+        std::cout << "istride:";
+        for(auto& i : params.istride)
+            std::cout << " " << i;
+        std::cout << "\n";
+    }
+    if(vm.count("ostride"))
+    {
+        std::cout << "ostride:";
+        for(auto& i : params.ostride)
+            std::cout << " " << i;
+        std::cout << "\n";
+    }
+
+    if(params.idist > 0)
+        std::cout << "idist: " << params.idist << "\n";
+    if(params.odist > 0)
+        std::cout << "odist: " << params.odist << "\n";
+
+    if(vm.count("ioffset"))
+    {
+        std::cout << "ioffset:";
+        for(auto& i : params.ioffset)
+            std::cout << " " << i;
+        std::cout << "\n";
+    }
+    if(vm.count("ooffset"))
+    {
+        std::cout << "ooffset:";
+        for(auto& i : params.ooffset)
+            std::cout << " " << i;
+        std::cout << "\n";
+    }
+
+    std::cout << std::flush;
+
+    // select the device requested with --device before any tuning work touches the GPU
+    HIP_V_THROW(hipSetDevice(deviceId), "set device failed!");
+    return offline_tune_problems(params, verbose, ntrial);
+}
diff -Nru rocfft-5.5.0/library/src/rocfft_ostream.cpp rocfft-5.7.1/library/src/rocfft_ostream.cpp
--- rocfft-5.5.0/library/src/rocfft_ostream.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rocfft_ostream.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -429,6 +429,9 @@
 {
     switch(precision)
     {
+    case rocfft_precision_half:
+        os << "half";
+        break;
     case rocfft_precision_single:
         os << "single";
         break;
diff -Nru rocfft-5.5.0/library/src/rocfft_solmap_convert.cpp rocfft-5.7.1/library/src/rocfft_solmap_convert.cpp
--- rocfft-5.5.0/library/src/rocfft_solmap_convert.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/rocfft_solmap_convert.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,77 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "../../shared/environment.h"
+#include "option_util.h"
+#include "rocfft.h"
+#include "solution_map.h"
+
+// Entry point of the solution-map converter: needs --input_file and --output_file, prints
+// the option list for --help, and returns EXIT_FAILURE on a missing file or failed conversion.
+int main(int argc, char* argv[])
+{
+    // This helps with mixing output of both wide and narrow characters to the screen
+    std::ios::sync_with_stdio(false);
+    std::string input_filename, output_filename;
+
+    // Declare the supported options.
+    // clang-format off
+    options_description opdesc("rocfft solution map converter command line options");
+    opdesc.add_options()("help,h", "produces this help message")
+        ("input_file", value<std::string>(&input_filename), "filename of base-solution-map")
+        ("output_file", value<std::string>(&output_filename), "filename of new-solution-map");
+    // clang-format on
+
+    variables_map vm;
+    store(parse_command_line(argc, argv, opdesc), vm);
+    notify(vm);
+
+    if(vm.count("help"))
+    {
+        std::cout << opdesc << std::endl;
+        return EXIT_SUCCESS;
+    }
+    if(!vm.count("input_file"))
+    {
+        std::cout << "Please specify file-path of the target solution map" << std::endl;
+        return EXIT_FAILURE;
+    }
+    if(!vm.count("output_file"))
+    {
+        std::cout << "Please specify file-path of the output solution map" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // don't use anything from solutions.cpp
+    rocfft_setenv("ROCFFT_USE_EMPTY_SOL_MAP", "1");
+    SolutionMapConverter converter;
+    if(converter.VersionCheckAndConvert(input_filename, output_filename))
+        return EXIT_SUCCESS;
+
+    std::cout << "Converting Solution Map Failed" << std::endl;
+    return EXIT_FAILURE;
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/rocfft_stub.cpp rocfft-5.7.1/library/src/rocfft_stub.cpp
--- rocfft-5.5.0/library/src/rocfft_stub.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rocfft_stub.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -28,7 +28,9 @@
 int log_plan_fd     = -1;
 int log_kernelio_fd = -1;
 int log_rtc_fd      = -1;
+int log_tuning_fd   = -1;
 
+#ifndef ROCFFT_BUILD_OFFLINE_TUNER
 extern "C" rocfft_status rocfft_plan_create(rocfft_plan*                  plan,
                                             rocfft_result_placement       placement,
                                             rocfft_transform_type         transform_type,
@@ -40,3 +42,4 @@
 {
     return rocfft_status_failure;
 }
+#endif
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/rtc_bluestein_gen.cpp rocfft-5.7.1/library/src/rtc_bluestein_gen.cpp
--- rocfft-5.5.0/library/src/rtc_bluestein_gen.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_bluestein_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -71,6 +71,7 @@
     std::string src;
 
     // includes and declarations
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
@@ -188,7 +189,7 @@
     Variable val{"val", "scalar_type"};
 
     func.body += Declaration{tx, "threadIdx.x + blockIdx.x * blockDim.x"};
-    func.body += Declaration{val, CallExpr{"lib_make_vector2<scalar_type>", {0, 0}}};
+    func.body += Declaration{val, CallExpr{"scalar_type", {Literal{"0.0"}, Literal{"0.0"}}}};
 
     func.body
         += If{twl == 1, {Assign{val, CallExpr{"TWLstep1", {twiddles_large, (tx * tx) % (2 * N)}}}}};
@@ -199,7 +200,7 @@
     func.body += ElseIf{twl == 4,
                         {Assign{val, CallExpr{"TWLstep4", {twiddles_large, (tx * tx) % (2 * N)}}}}};
 
-    func.body += MultiplyAssign(val.y, CallExpr{"real_type_t<scalar_type>", {dir}});
+    func.body += MultiplyAssign(val.y(), CallExpr{"real_type_t<scalar_type>", {dir}});
 
     func.body += If{tx == 0,
                     {
@@ -212,10 +213,10 @@
 
                          Assign{output[M - tx], val},
                          Assign{output[M - tx + M], val}}};
-    func.body
-        += ElseIf{tx <= (M - N),
-                  {Assign{output[tx], CallExpr{"lib_make_vector2<scalar_type>", {0, 0}}},
-                   Assign{output[tx + M], CallExpr{"lib_make_vector2<scalar_type>", {0, 0}}}}};
+    func.body += ElseIf{
+        tx <= (M - N),
+        {Assign{output[tx], CallExpr{"scalar_type", {Literal{"0.0"}, Literal{"0.0"}}}},
+         Assign{output[tx + M], CallExpr{"scalar_type", {Literal{"0.0"}, Literal{"0.0"}}}}}};
 
     return func.render();
 }
@@ -224,6 +225,7 @@
 {
     std::string src;
     // includes and declarations
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
@@ -328,12 +330,13 @@
         If       readBlock{tx < N, {}};
         readBlock.body += Declaration{in_elem};
         readBlock.body += Assign{in_elem, LoadGlobal{input, iIdx}};
-        readBlock.body += Assign{output[oIdx].x, in_elem.x * chirp[tx].x + in_elem.y * chirp[tx].y};
         readBlock.body
-            += Assign{output[oIdx].y, -in_elem.x * chirp[tx].y + in_elem.y * chirp[tx].x};
+            += Assign{output[oIdx].x(), in_elem.x() * chirp[tx].x() + in_elem.y() * chirp[tx].y()};
+        readBlock.body
+            += Assign{output[oIdx].y(), -in_elem.x() * chirp[tx].y() + in_elem.y() * chirp[tx].x()};
         func.body += readBlock;
-        func.body
-            += Else{{Assign{output[oIdx], CallExpr{"lib_make_vector2<scalar_type>", {0, 0}}}}};
+        func.body += Else{
+            {Assign{output[oIdx], CallExpr{"scalar_type", {Literal{"0.0"}, Literal{"0.0"}}}}}};
         break;
     }
     case CS_KERNEL_FFT_MUL:
@@ -343,10 +346,10 @@
                                   "don't need to run callbacks."};
         func.body += AddAssign(output, oOffset);
         func.body += Declaration{out_elem, output[oIdx]};
-        func.body
-            += Assign{output[oIdx].x, input[iIdx].x * out_elem.x - input[iIdx].y * out_elem.y};
-        func.body
-            += Assign{output[oIdx].y, input[iIdx].x * out_elem.y + input[iIdx].y * out_elem.x};
+        func.body += Assign{output[oIdx].x(),
+                            input[iIdx].x() * out_elem.x() - input[iIdx].y() * out_elem.y()};
+        func.body += Assign{output[oIdx].y(),
+                            input[iIdx].x() * out_elem.y() + input[iIdx].y() * out_elem.x()};
         break;
     }
     case CS_KERNEL_RES_MUL:
@@ -362,10 +365,11 @@
         Variable MI{"MI", "real_type_t<scalar_type>"};
         func.body += Declaration{MI, Literal{"1.0"} / CallExpr{"real_type_t<scalar_type>", {M}}};
         func.body += Declaration{out_elem};
+        func.body += Assign{
+            out_elem.x(), MI * (input[iIdx].x() * chirp[tx].x() + input[iIdx].y() * chirp[tx].y())};
         func.body
-            += Assign{out_elem.x, MI * (input[iIdx].x * chirp[tx].x + input[iIdx].y * chirp[tx].y)};
-        func.body += Assign{out_elem.y,
-                            MI * (-input[iIdx].x * chirp[tx].y + input[iIdx].y * chirp[tx].x)};
+            += Assign{out_elem.y(),
+                      MI * (-input[iIdx].x() * chirp[tx].y() + input[iIdx].y() * chirp[tx].x())};
         if(specs.enable_scaling)
             func.body += MultiplyAssign(out_elem, scale_factor);
         func.body += StoreGlobal{output, oIdx, out_elem};
diff -Nru rocfft-5.5.0/library/src/rtc_bluestein_kernel.cpp rocfft-5.7.1/library/src/rtc_bluestein_kernel.cpp
--- rocfft-5.5.0/library/src/rtc_bluestein_kernel.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_bluestein_kernel.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -21,6 +21,7 @@
 #include "rtc_bluestein_kernel.h"
 #include "../../shared/arithmetic.h"
 #include "../../shared/array_predicate.h"
+#include "../../shared/precision_type.h"
 #include "function_pool.h"
 #include "kernel_launch.h"
 #include "rtc_bluestein_gen.h"
@@ -106,10 +107,18 @@
         if(array_type_is_planar(data.node->outArrayType))
             kargs.append_ptr(data.bufOut[1]);
     }
-    if(data.node->precision == rocfft_precision_single)
+    switch(data.node->precision)
+    {
+    case rocfft_precision_half:
+        kargs.append_half(data.node->scale_factor);
+        break;
+    case rocfft_precision_single:
         kargs.append_float(data.node->scale_factor);
-    else
+        break;
+    case rocfft_precision_double:
         kargs.append_double(data.node->scale_factor);
+        break;
+    }
 
     // callback params
     kargs.append_ptr(data.callbacks.load_cb_fn);
@@ -220,15 +229,7 @@
     }
     else
     {
-        size_t cBytes;
-        if(data.node->precision == rocfft_precision_single)
-        {
-            cBytes = sizeof(float) * 2;
-        }
-        else
-        {
-            cBytes = sizeof(double) * 2;
-        }
+        const size_t cBytes = complex_type_size(data.node->precision);
 
         void* bufIn0  = data.bufIn[0];
         void* bufOut0 = data.bufOut[0];
@@ -266,10 +267,18 @@
         kargs.append_ptr(data.callbacks.store_cb_fn);
         kargs.append_ptr(data.callbacks.store_cb_data);
 
-        if(data.node->precision == rocfft_precision_single)
+        switch(data.node->precision)
+        {
+        case rocfft_precision_half:
+            kargs.append_half(data.node->scale_factor);
+            break;
+        case rocfft_precision_single:
             kargs.append_float(data.node->scale_factor);
-        else
+            break;
+        case rocfft_precision_double:
             kargs.append_double(data.node->scale_factor);
+            break;
+        }
     }
     return kargs;
 }
diff -Nru rocfft-5.5.0/library/src/rtc_cache.cpp rocfft-5.7.1/library/src/rtc_cache.cpp
--- rocfft-5.5.0/library/src/rtc_cache.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_cache.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -44,21 +44,29 @@
 // in-process instead of making everything go to subprocess.
 static std::mutex compile_lock;
 
-// Get path to system RTC cache - returns empty if no suitable path
-// can be found
-static fs::path rtccache_db_sys_path()
+// Get candidate paths to the system RTC cache, in decreasing order of preference (may be empty).
+static std::vector<fs::path> rtccache_db_sys_paths()
 {
     // if env var is set, use that directly
-    auto env_path = rocfft_getenv("ROCFFT_RTC_SYS_CACHE_PATH");
+    std::vector<fs::path> paths;
+    auto                  env_path = rocfft_getenv("ROCFFT_RTC_SYS_CACHE_PATH");
+
     if(!env_path.empty())
-        return env_path;
-    auto lib_path = get_library_path();
-    if(!lib_path.empty())
     {
-        fs::path library_parent_path = lib_path.parent_path();
-        return library_parent_path / default_cache_filename;
+        paths.push_back(env_path); // the env override is the only candidate
     }
-    return {};
+    else
+    {
+        auto lib_path = get_library_path();
+        if(!lib_path.empty())
+        {
+            // otherwise try next to the library first, then in a "rocfft" subdir beside it
+            fs::path library_parent_path = lib_path.parent_path();
+            paths.push_back(library_parent_path / default_cache_filename);
+            paths.push_back(library_parent_path / "rocfft" / default_cache_filename);
+        }
+    }
+    return paths;
 }
 
 // Get list of candidate paths to RTC user cache DB, in decreasing
@@ -132,9 +140,13 @@
 
 RTCCache::RTCCache()
 {
-    auto sys_path = rtccache_db_sys_path();
-    if(!sys_path.empty())
-        db_sys = connect_db(sys_path, true);
+    auto sys_paths = rtccache_db_sys_paths();
+    for(const auto& p : sys_paths)
+    {
+        db_sys = connect_db(p, true);
+        if(db_sys)
+            break;
+    }
 
     auto paths = rtccache_db_user_paths();
     for(const auto& p : paths)
diff -Nru rocfft-5.5.0/library/src/rtc_chirp_gen.cpp rocfft-5.7.1/library/src/rtc_chirp_gen.cpp
--- rocfft-5.5.0/library/src/rtc_chirp_gen.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_chirp_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,99 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "rtc_chirp_gen.h"
+#include "device/kernel-generator-embed.h"
+#include "rtc_kernel.h"
+
+// Name of the RTC chirp kernel: "chirp_gen" followed by the precision suffix.
+std::string chirp_rtc_kernel_name(rocfft_precision precision)
+{
+    const std::string prefix{"chirp_gen"};
+    return prefix + rtc_precision_name(precision);
+}
+
+const char* chirp_rtc_header = "extern \"C\" __global__ void ";
+
+// Render the __launch_bounds__ attribute (with a trailing space) for the
+// chirp kernel, using CHIRP_THREADS as the bound.
+static std::string chirp_rtc_launch_bounds()
+{
+    const std::string thread_count = std::to_string(CHIRP_THREADS);
+    return "__launch_bounds__(" + thread_count + ") ";
+}
+
+// Parameter list for the generated chirp kernel: N, then the output buffer.
+static std::string chirp_rtc_args()
+{
+    // adjacent string literals are merged into one by the compiler
+    return std::string{"("
+                       "size_t N, scalar_type* output"
+                       ")"};
+}
+
+// Brace-wrapped body of the generated chirp kernel. Each thread with i < N
+// derives a fraction fp from i*i and 2*N, then stores cos(TWO_PI * fp) and
+// sin(TWO_PI * fp) into output[i].x / output[i].y.
+static std::string chirp_rtc_body()
+{
+    return std::string{"{"} + R"_SRC(
+        auto i = threadIdx.x + blockIdx.x * blockDim.x;
+
+        if(i < N)
+        {
+            unsigned int twoN = 2 * N;
+            unsigned int iSq  = i * i;
+
+            auto f = (double)iSq / (double)twoN;
+
+            unsigned int fRnd = floor(f);
+
+            auto aLow = iSq;
+            auto bLow = twoN * fRnd;
+
+            auto aHi = __umulhi(i, i);
+            auto bHi = __umulhi(twoN, fRnd);
+
+            auto f1 = (aHi - bHi) * (double)(0x100000000 % twoN) / (double)twoN;
+            auto f2 = (double)((aLow - bLow) % twoN) / (double)twoN;
+            auto fp = (f1 - floor(f1)) + f2;
+
+            output[i].x = cos(TWO_PI * fp);
+            output[i].y = sin(TWO_PI * fp);
+        }
+        )_SRC" + "}";
+}
+
+// Assemble the complete chirp kernel source: embedded headers, the precision
+// type declaration, the TWO_PI constant, then the kernel definition itself.
+std::string chirp_rtc(const std::string& kernel_name, rocfft_precision precision)
+{
+    std::string src;
+
+    // preamble: headers and declarations the kernel body relies on
+    src.append(rocfft_complex_h).append(common_h);
+    src.append(rtc_precision_type_decl(precision));
+    src.append("static constexpr double TWO_PI = 6.283185307179586476925286766559;\n");
+
+    // kernel: signature pieces followed by the body
+    src.append(chirp_rtc_header).append(chirp_rtc_launch_bounds()).append(kernel_name);
+    src.append(chirp_rtc_args()).append(chirp_rtc_body());
+    return src;
+}
diff -Nru rocfft-5.5.0/library/src/rtc_chirp_kernel.cpp rocfft-5.7.1/library/src/rtc_chirp_kernel.cpp
--- rocfft-5.5.0/library/src/rtc_chirp_kernel.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_chirp_kernel.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,35 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "rtc_chirp_kernel.h"
+#include "device/kernel-generator-embed.h"
+#include "rtc_cache.h"
+
+// Build the chirp kernel for gpu_arch/precision through cached_compile.
+RTCKernelChirp RTCKernelChirp::generate(const std::string& gpu_arch, rocfft_precision precision)
+{
+    auto kernel_name = chirp_rtc_kernel_name(precision);
+    // callback handed to cached_compile to produce the kernel source
+    kernel_src_gen_t generator{
+        [precision](const std::string& name) { return chirp_rtc(name, precision); }};
+
+    auto code = cached_compile(kernel_name, gpu_arch, generator, generator_sum());
+    return RTCKernelChirp{kernel_name, code, {}, {}};
+}
diff -Nru rocfft-5.5.0/library/src/rtc_kernel.cpp rocfft-5.7.1/library/src/rtc_kernel.cpp
--- rocfft-5.5.0/library/src/rtc_kernel.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_kernel.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -68,6 +68,9 @@
         case rocfft_precision_single:
             kargs.append_float(data.node->scale_factor);
             break;
+        case rocfft_precision_half:
+            kargs.append_half(data.node->scale_factor);
+            break;
         }
     }
 
@@ -117,8 +120,19 @@
         throw std::runtime_error("hipModuleLaunchKernel failure");
 }
 
-std::shared_future<std::unique_ptr<RTCKernel>> RTCKernel::runtime_compile(
-    const TreeNode& node, const std::string& gpu_arch, bool enable_callbacks)
+// Query max active blocks per multiprocessor for this kernel; false on HIP error.
+bool RTCKernel::get_occupancy(dim3 blockDim, unsigned int lds_bytes, int& occupancy)
+{
+    const auto threads_per_block = blockDim.x * blockDim.y * blockDim.z;
+    return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
+               &occupancy, kernel, threads_per_block, lds_bytes) == hipSuccess;
+}
+
+std::shared_future<std::unique_ptr<RTCKernel>>
+    RTCKernel::runtime_compile(const TreeNode&    node,
+                               const std::string& gpu_arch,
+                               std::string&       kernel_name,
+                               bool               enable_callbacks)
 {
 
 #ifdef ROCFFT_RUNTIME_COMPILE
@@ -149,7 +163,7 @@
         generator = RTCKernelApplyCallback::generate_from_node(node, gpu_arch, enable_callbacks);
     if(generator.valid())
     {
-        std::string kernel_name = generator.generate_name();
+        kernel_name = generator.generate_name();
 
         auto compile = [=]() {
             if(hipSetDevice(deviceId) != hipSuccess)
@@ -174,6 +188,11 @@
         // compile to code object
         return std::async(std::launch::async, compile);
     }
+    // a pre-compiled RTC stockham kernel ends up here: only its name is recorded
+    else if(generator.is_pre_compiled())
+    {
+        kernel_name = generator.generate_name();
+    }
 #endif
     // runtime compilation is not enabled or no kernel found, return
     // null RTCKernel
diff -Nru rocfft-5.5.0/library/src/rtc_realcomplex_gen.cpp rocfft-5.7.1/library/src/rtc_realcomplex_gen.cpp
--- rocfft-5.5.0/library/src/rtc_realcomplex_gen.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_realcomplex_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -182,7 +182,7 @@
         write_conj.body += Declaration{elem};
         write_conj.body += Assign{elem, LoadGlobal{input, input_offset}};
         write_conj.body += Assign{outputs[0], elem};
-        write_conj.body += Assign{elem.y, UnaryMinus{elem.y}};
+        write_conj.body += Assign{elem.y(), UnaryMinus{elem.y()}};
         write_conj.body += Assign{outputc[0], elem};
         func.body += write_conj;
     }
@@ -269,7 +269,7 @@
             guard.body += CallbackDeclaration("real_type_t<scalar_type>", "cbtype");
 
             Variable elem{"elem", "auto"};
-            guard.body += Declaration{elem, input[inputIdx].x};
+            guard.body += Declaration{elem, input[inputIdx].x()};
             if(specs.enable_scaling)
                 guard.body += MultiplyAssign(elem, scale_factor_var);
             guard.body += StoreGlobal{output, outputIdx, elem};
@@ -291,6 +291,8 @@
 {
     std::string src;
     // includes and declarations
+
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
@@ -351,6 +353,8 @@
 {
     std::string src;
     // includes and declarations
+
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
@@ -455,15 +459,15 @@
     if(specs.scheme == CS_KERNEL_R_TO_CMPLX)
     {
         if_idx_p_zero.body
-            += Assign{outval.x, input[input_offset + 0].x - input[input_offset + 0].y};
-        if_idx_p_zero.body += Assign{outval.y, 0};
+            += Assign{outval.x(), input[input_offset + 0].x() - input[input_offset + 0].y()};
+        if_idx_p_zero.body += Assign{outval.y(), 0};
         if(specs.enable_scaling)
             if_idx_p_zero.body += MultiplyAssign(outval, scale_factor);
         if_idx_p_zero.body += StoreGlobal{output, output_offset + half_N, outval};
 
         if_idx_p_zero.body
-            += Assign{outval.x, input[input_offset + 0].x + input[input_offset + 0].y};
-        if_idx_p_zero.body += Assign{outval.y, 0};
+            += Assign{outval.x(), input[input_offset + 0].x() + input[input_offset + 0].y()};
+        if_idx_p_zero.body += Assign{outval.y(), 0};
         if(specs.enable_scaling)
             if_idx_p_zero.body += MultiplyAssign(outval, scale_factor);
         if_idx_p_zero.body += StoreGlobal{output, output_offset + 0, outval};
@@ -474,15 +478,15 @@
         if_idx_p_zero.body += Assign{p, LoadGlobal{input, input_offset + idx_p}};
         if_idx_p_zero.body += Declaration{q};
         if_idx_p_zero.body += Assign{q, LoadGlobal{input, input_offset + idx_q}};
-        if_idx_p_zero.body += Assign{output[output_offset + idx_p].x, p.x + q.x};
-        if_idx_p_zero.body += Assign{output[output_offset + idx_p].y, p.x - q.x};
+        if_idx_p_zero.body += Assign{output[output_offset + idx_p].x(), p.x() + q.x()};
+        if_idx_p_zero.body += Assign{output[output_offset + idx_p].y(), p.x() - q.x()};
     }
 
     If if_Ndiv4{"Ndiv4", {}};
     if(specs.scheme == CS_KERNEL_R_TO_CMPLX)
     {
-        if_Ndiv4.body += Assign{outval.x, input[input_offset + quarter_N].x};
-        if_Ndiv4.body += Assign{outval.y, -input[input_offset + quarter_N].y};
+        if_Ndiv4.body += Assign{outval.x(), input[input_offset + quarter_N].x()};
+        if_Ndiv4.body += Assign{outval.y(), -input[input_offset + quarter_N].y()};
         if(specs.enable_scaling)
             if_Ndiv4.body += MultiplyAssign(outval, scale_factor);
         if_Ndiv4.body += StoreGlobal{output, output_offset + quarter_N, outval};
@@ -493,9 +497,9 @@
         if_Ndiv4.body += Declaration{quarter_elem};
         if_Ndiv4.body += Assign{quarter_elem, LoadGlobal{input, input_offset + quarter_N}};
         if_Ndiv4.body
-            += Assign{output[output_offset + quarter_N].x, Literal{"2.0"} * quarter_elem.x};
+            += Assign{output[output_offset + quarter_N].x(), Literal{"2.0"} * quarter_elem.x()};
         if_Ndiv4.body
-            += Assign{output[output_offset + quarter_N].y, Literal{"-2.0"} * quarter_elem.y};
+            += Assign{output[output_offset + quarter_N].y(), Literal{"-2.0"} * quarter_elem.y()};
     }
 
     if_idx_p_zero.body += if_Ndiv4;
@@ -514,14 +518,18 @@
         else_idx_p_nonzero.body += Declaration{twd_p, twiddles[idx_p]};
         else_idx_p_nonzero.body += CommentLines{"NB: twd_q = -conj(twd_p) = (-twd_p.x, twd_p.y);"};
 
-        else_idx_p_nonzero.body += Assign{outval.x, u.x + v.x * twd_p.y + u.y * twd_p.x};
-        else_idx_p_nonzero.body += Assign{outval.y, v.y + u.y * twd_p.y - v.x * twd_p.x};
+        else_idx_p_nonzero.body
+            += Assign{outval.x(), u.x() + v.x() * twd_p.y() + u.y() * twd_p.x()};
+        else_idx_p_nonzero.body
+            += Assign{outval.y(), v.y() + u.y() * twd_p.y() - v.x() * twd_p.x()};
         if(specs.enable_scaling)
             else_idx_p_nonzero.body += MultiplyAssign(outval, scale_factor);
         else_idx_p_nonzero.body += StoreGlobal{output, output_offset + idx_p, outval};
 
-        else_idx_p_nonzero.body += Assign{outval.x, u.x - v.x * twd_p.y - u.y * twd_p.x};
-        else_idx_p_nonzero.body += Assign{outval.y, -v.y + u.y * twd_p.y - v.x * twd_p.x};
+        else_idx_p_nonzero.body
+            += Assign{outval.x(), u.x() - v.x() * twd_p.y() - u.y() * twd_p.x()};
+        else_idx_p_nonzero.body
+            += Assign{outval.y(), -v.y() + u.y() * twd_p.y() - v.x() * twd_p.x()};
         if(specs.enable_scaling)
             else_idx_p_nonzero.body += MultiplyAssign(outval, scale_factor);
         else_idx_p_nonzero.body += StoreGlobal{output, output_offset + idx_q, outval};
@@ -538,15 +546,15 @@
         else_idx_p_nonzero.body += Declaration{twd_p, twiddles[idx_p]};
         else_idx_p_nonzero.body += CommentLines{"NB: twd_q = -conj(twd_p);"};
 
-        else_idx_p_nonzero.body
-            += Assign{output[output_offset + idx_p].x, u.x + v.x * twd_p.y - u.y * twd_p.x};
-        else_idx_p_nonzero.body
-            += Assign{output[output_offset + idx_p].y, v.y + u.y * twd_p.y + v.x * twd_p.x};
-
-        else_idx_p_nonzero.body
-            += Assign{output[output_offset + idx_q].x, u.x - v.x * twd_p.y + u.y * twd_p.x};
-        else_idx_p_nonzero.body
-            += Assign{output[output_offset + idx_q].y, -v.y + u.y * twd_p.y + v.x * twd_p.x};
+        else_idx_p_nonzero.body += Assign{output[output_offset + idx_p].x(),
+                                          u.x() + v.x() * twd_p.y() - u.y() * twd_p.x()};
+        else_idx_p_nonzero.body += Assign{output[output_offset + idx_p].y(),
+                                          v.y() + u.y() * twd_p.y() + v.x() * twd_p.x()};
+
+        else_idx_p_nonzero.body += Assign{output[output_offset + idx_q].x(),
+                                          u.x() - v.x() * twd_p.y() + u.y() * twd_p.x()};
+        else_idx_p_nonzero.body += Assign{output[output_offset + idx_q].y(),
+                                          -v.y() + u.y() * twd_p.y() + v.x() * twd_p.x()};
     }
 
     guard.body += else_idx_p_nonzero;
@@ -601,6 +609,8 @@
 
     std::string src;
     // includes and declarations
+
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
@@ -801,18 +811,18 @@
         write_condition = Literal{"blockIdx.x"} == 0 && Literal{"threadIdx.x"} == 0
                           && row_start + lds_row < row_end;
 
-        compute_first_val += Assign{val.x, first_elem.x - first_elem.y};
-        compute_first_val += Assign{val.y, Literal{"0.0"}};
+        compute_first_val += Assign{val.x(), first_elem.x() - first_elem.y()};
+        compute_first_val += Assign{val.y(), Literal{"0.0"}};
         write_first_idx = CallExpr{"output_row_base", {dim, output_batch_start, outStride, len_row}}
                           + row_start + lds_row;
 
-        compute_middle_val += Assign{val.x, middle_elem.x};
-        compute_middle_val += Assign{val.y, -middle_elem.y};
+        compute_middle_val += Assign{val.x(), middle_elem.x()};
+        compute_middle_val += Assign{val.y(), -middle_elem.y()};
         write_middle_idx = CallExpr{"output_row_base", {dim, output_batch_start, outStride, middle}}
                            + row_start + lds_row;
 
-        compute_last_val += Assign{val.x, first_elem.x + first_elem.y};
-        compute_last_val += Assign{val.y, Literal{"0.0"}};
+        compute_last_val += Assign{val.x(), first_elem.x() + first_elem.y()};
+        compute_last_val += Assign{val.y(), Literal{"0.0"}};
         write_last_idx = CallExpr{"output_row_base", {dim, output_batch_start, outStride, 0}}
                          + row_start + lds_row;
     }
@@ -842,12 +852,12 @@
         write_condition = Literal{"blockIdx.y"} == 0 && Literal{"threadIdx.y"} == 0
                           && row_start + lds_col < row_end;
 
-        compute_first_val += Assign{val.x, first_elem.x + last_elem.x};
-        compute_first_val += Assign{val.y, first_elem.x - last_elem.x};
+        compute_first_val += Assign{val.x(), first_elem.x() + last_elem.x()};
+        compute_first_val += Assign{val.y(), first_elem.x() - last_elem.x()};
         write_first_idx = output_batch_start + output_row_base;
 
-        compute_middle_val += Assign{val.x, Literal{"2.0"} * middle_elem.x};
-        compute_middle_val += Assign{val.y, Literal{"-2.0"} * middle_elem.y};
+        compute_middle_val += Assign{val.x(), Literal{"2.0"} * middle_elem.x()};
+        compute_middle_val += Assign{val.y(), Literal{"-2.0"} * middle_elem.y()};
         write_middle_idx = output_batch_start + output_row_base + middle * output_row_stride;
     }
 
@@ -924,8 +934,8 @@
         butterfly.body += CommentLines{"NB: twd_q = -conj(twd_p) = (-twd_p.x, twd_p.y)"};
 
         butterfly.body += CommentLines{"write left side"};
-        butterfly.body += Assign{val.x, u.x + v.x * twd_p.y + u.y * twd_p.x};
-        butterfly.body += Assign{val.y, v.y + u.y * twd_p.y - v.x * twd_p.x};
+        butterfly.body += Assign{val.x(), u.x() + v.x() * twd_p.y() + u.y() * twd_p.x()};
+        butterfly.body += Assign{val.y(), v.y() + u.y() * twd_p.y() - v.x() * twd_p.x()};
         butterfly.body
             += StoreGlobal{output,
                            CallExpr{"output_row_base", {dim, output_batch_start, outStride, col}}
@@ -933,8 +943,8 @@
                            val};
 
         butterfly.body += CommentLines{"write right side"};
-        butterfly.body += Assign{val.x, u.x - v.x * twd_p.y - u.y * twd_p.x};
-        butterfly.body += Assign{val.y, -v.y + u.y * twd_p.y - v.x * twd_p.x};
+        butterfly.body += Assign{val.x(), u.x() - v.x() * twd_p.y() - u.y() * twd_p.x()};
+        butterfly.body += Assign{val.y(), -v.y() + u.y() * twd_p.y() - v.x() * twd_p.x()};
         butterfly.body += StoreGlobal{
             output,
             CallExpr{"output_row_base", {dim, output_batch_start, outStride, len_row - col}}
@@ -955,16 +965,16 @@
         butterfly.body += Declaration{twd_p, twiddles[left_col_start + lds_row]};
 
         butterfly.body += CommentLines{"write top side"};
-        butterfly.body += Assign{val.x, u.x + v.x * twd_p.y - u.y * twd_p.x};
-        butterfly.body += Assign{val.y, v.y + u.y * twd_p.y + v.x * twd_p.x};
+        butterfly.body += Assign{val.x(), u.x() + v.x() * twd_p.y() - u.y() * twd_p.x()};
+        butterfly.body += Assign{val.y(), v.y() + u.y() * twd_p.y() + v.x() * twd_p.x()};
         butterfly.body += StoreGlobal{output,
                                       output_batch_start + output_row_base
                                           + (left_col_start + lds_row) * output_row_stride,
                                       val};
 
         butterfly.body += CommentLines{"write bottom side"};
-        butterfly.body += Assign{val.x, u.x - v.x * twd_p.y + u.y * twd_p.x};
-        butterfly.body += Assign{val.y, -v.y + u.y * twd_p.y + v.x * twd_p.x};
+        butterfly.body += Assign{val.x(), u.x() - v.x() * twd_p.y() + u.y() * twd_p.x()};
+        butterfly.body += Assign{val.y(), -v.y() + u.y() * twd_p.y() + v.x() * twd_p.x()};
         butterfly.body
             += StoreGlobal{output,
                            output_batch_start + output_row_base
@@ -994,6 +1004,8 @@
     std::string src;
 
     // includes and declarations
+
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
diff -Nru rocfft-5.5.0/library/src/rtc_realcomplex_kernel.cpp rocfft-5.7.1/library/src/rtc_realcomplex_kernel.cpp
--- rocfft-5.5.0/library/src/rtc_realcomplex_kernel.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_realcomplex_kernel.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -124,10 +124,18 @@
     if(data.node->scheme == CS_KERNEL_COPY_CMPLX_TO_HERM
        || data.node->scheme == CS_KERNEL_COPY_CMPLX_TO_R)
     {
-        if(data.node->precision == rocfft_precision_single)
+        switch(data.node->precision)
+        {
+        case rocfft_precision_half:
+            kargs.append_half(data.node->scale_factor);
+            break;
+        case rocfft_precision_single:
             kargs.append_float(data.node->scale_factor);
-        else
+            break;
+        case rocfft_precision_double:
             kargs.append_double(data.node->scale_factor);
+            break;
+        }
     }
 
     return kargs;
@@ -221,10 +229,18 @@
     kargs.append_ptr(data.callbacks.store_cb_data);
     if(data.node->IsScalingEnabled())
     {
-        if(data.node->precision == rocfft_precision_single)
+        switch(data.node->precision)
+        {
+        case rocfft_precision_half:
+            kargs.append_half(data.node->scale_factor);
+            break;
+        case rocfft_precision_single:
             kargs.append_float(data.node->scale_factor);
-        else
+            break;
+        case rocfft_precision_double:
             kargs.append_double(data.node->scale_factor);
+            break;
+        }
     }
 
     return kargs;
diff -Nru rocfft-5.5.0/library/src/rtc_stockham_gen.cpp rocfft-5.7.1/library/src/rtc_stockham_gen.cpp
--- rocfft-5.5.0/library/src/rtc_stockham_gen.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_stockham_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@
 
 using namespace std::placeholders;
 
+#include "device/generator/bluestein_generator.h"
 #include "device/generator/generator.h"
 #include "device/generator/stockham_gen.h"
 #include "device/generator/stockham_gen_base.h"
@@ -39,25 +40,25 @@
 #include "device/kernel-generator-embed.h"
 
 // generate name for RTC stockham kernel
-std::string stockham_rtc_kernel_name(ComputeScheme           scheme,
-                                     size_t                  length1D,
-                                     size_t                  length2D,
-                                     size_t                  static_dim,
-                                     int                     direction,
-                                     rocfft_precision        precision,
-                                     rocfft_result_placement placement,
-                                     rocfft_array_type       inArrayType,
-                                     rocfft_array_type       outArrayType,
-                                     bool                    unitstride,
-                                     size_t                  largeTwdBase,
-                                     size_t                  largeTwdSteps,
-                                     bool                    largeTwdBatchIsTransformCount,
-                                     EmbeddedType            ebtype,
-                                     DirectRegType           dir2regMode,
-                                     IntrinsicAccessType     intrinsicMode,
-                                     SBRC_TRANSPOSE_TYPE     transpose_type,
-                                     bool                    enable_callbacks,
-                                     bool                    enable_scaling)
+std::string stockham_rtc_kernel_name(const StockhamGeneratorSpecs& specs,
+                                     const StockhamGeneratorSpecs& specs2d,
+                                     ComputeScheme                 scheme,
+                                     int                           direction,
+                                     rocfft_precision              precision,
+                                     rocfft_result_placement       placement,
+                                     rocfft_array_type             inArrayType,
+                                     rocfft_array_type             outArrayType,
+                                     bool                          unitstride,
+                                     size_t                        largeTwdBase,
+                                     size_t                        largeTwdSteps,
+                                     bool                          largeTwdBatchIsTransformCount,
+                                     EmbeddedType                  ebtype,
+                                     DirectRegType                 dir2regMode,
+                                     IntrinsicAccessType           intrinsicMode,
+                                     SBRC_TRANSPOSE_TYPE           transpose_type,
+                                     bool                          enable_callbacks,
+                                     bool                          enable_scaling,
+                                     BluesteinFuseType             fuseBlue)
 {
     std::string kernel_name = "fft_rtc";
 
@@ -67,14 +68,40 @@
         kernel_name += "_back";
 
     kernel_name += "_len";
-    kernel_name += std::to_string(length1D);
-    if(length2D)
-        kernel_name += "x" + std::to_string(length2D);
+    kernel_name += std::to_string(specs.length);
+    if(scheme == CS_KERNEL_2D_SINGLE)
+        kernel_name += "x" + std::to_string(specs2d.length);
+
+    // the kernel configuration also needs to be encoded in the name
+    kernel_name += "_factors";
+    for(auto f : specs.factors)
+    {
+        kernel_name += "_";
+        kernel_name += std::to_string(f);
+    }
+    if(scheme == CS_KERNEL_2D_SINGLE)
+    {
+        kernel_name += "_x";
+        for(auto f : specs2d.factors)
+        {
+            kernel_name += "_";
+            kernel_name += std::to_string(f);
+        }
+    }
+    kernel_name += "_wgs_";
+    kernel_name += std::to_string(specs.workgroup_size);
+    kernel_name += "_tpt_";
+    kernel_name += std::to_string(specs.threads_per_transform);
+    if(scheme == CS_KERNEL_2D_SINGLE)
+        kernel_name += "x" + std::to_string(specs2d.threads_per_transform);
+
+    if(specs.half_lds)
+        kernel_name += "_halfLds";
 
-    if(static_dim)
+    if(specs.static_dim)
     {
         kernel_name += "_dim";
-        kernel_name += std::to_string(static_dim);
+        kernel_name += std::to_string(specs.static_dim);
     }
 
     kernel_name += rtc_precision_name(precision);
@@ -133,6 +160,21 @@
         throw std::runtime_error("unsupported scheme in stockham_rtc_kernel_name");
     }
 
+    switch(fuseBlue)
+    {
+    case BFT_NONE:
+        break;
+    case BFT_FWD_CHIRP:
+        kernel_name += "_fwd_chirp";
+        break;
+    case BFT_FWD_CHIRP_MUL:
+        kernel_name += "_fwd_chirp_mul";
+        break;
+    case BFT_INV_CHIRP_MUL:
+        kernel_name += "_inv_chirp_mul";
+        break;
+    }
+
     switch(transpose_type)
     {
     case NONE:
@@ -213,14 +255,19 @@
                          IntrinsicAccessType           intrinsicMode,
                          SBRC_TRANSPOSE_TYPE           transpose_type,
                          bool                          enable_callbacks,
-                         bool                          enable_scaling)
+                         bool                          enable_scaling,
+                         const BluesteinFuseType&      fuseBlue)
 {
     std::unique_ptr<Function> lds2reg, reg2lds, device;
     std::unique_ptr<Function> lds2reg1, reg2lds1, device1;
+    std::unique_ptr<Function> bluestein_load, bluestein_intrinsic_load;
+    std::unique_ptr<Function> bluestein_store, bluestein_intrinsic_store;
     std::unique_ptr<Function> global;
 
     std::vector<unsigned int> all_factors;
 
+    auto fuseBluestein = (fuseBlue != BFT_NONE);
+
     if(scheme == CS_KERNEL_2D_SINGLE)
     {
         StockhamKernelFused2D kernel(specs, specs2d);
@@ -251,19 +298,18 @@
         if(scheme == CS_KERNEL_STOCKHAM)
             kernel = std::make_unique<StockhamKernelRR>(specs);
         else if(scheme == CS_KERNEL_STOCKHAM_BLOCK_CC)
-            kernel = std::make_unique<StockhamKernelCC>(specs, largeTwdBatchIsTransformCount);
+            kernel = std::make_unique<StockhamKernelCC>(
+                specs, largeTwdBatchIsTransformCount, fuseBluestein);
         else if(scheme == CS_KERNEL_STOCKHAM_BLOCK_CR)
             kernel = std::make_unique<StockhamKernelCR>(specs);
         else if(scheme == CS_KERNEL_STOCKHAM_BLOCK_RC)
-        {
-            kernel = std::make_unique<StockhamKernelRC>(specs);
-        }
+            kernel = std::make_unique<StockhamKernelRC>(specs, fuseBluestein);
         else if(scheme == CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z)
-            kernel = std::make_unique<StockhamKernelRC>(specs);
+            kernel = std::make_unique<StockhamKernelRC>(specs, false);
         else if(scheme == CS_KERNEL_STOCKHAM_TRANSPOSE_Z_XY)
-            kernel = std::make_unique<StockhamKernelRC>(specs);
+            kernel = std::make_unique<StockhamKernelRC>(specs, false);
         else if(scheme == CS_KERNEL_STOCKHAM_R_TO_CMPLX_TRANSPOSE_Z_XY)
-            kernel = std::make_unique<StockhamKernelRC>(specs);
+            kernel = std::make_unique<StockhamKernelRC>(specs, false);
         else
             throw std::runtime_error("unhandled scheme");
         if(transforms_per_block)
@@ -271,7 +317,25 @@
         lds2reg = std::make_unique<Function>(kernel->generate_lds_to_reg_input_function());
         reg2lds = std::make_unique<Function>(kernel->generate_lds_from_reg_output_function());
         device  = std::make_unique<Function>(kernel->generate_device_function());
-        global  = std::make_unique<Function>(kernel->generate_global_function());
+
+        if(fuseBluestein)
+        {
+            auto planar_blue_load = array_type_is_planar(inArrayType);
+            bluestein_load = std::make_unique<Function>(generate_bluestein_device_load_function(
+                scheme, fuseBlue, direction, planar_blue_load, false));
+            bluestein_intrinsic_load
+                = std::make_unique<Function>(generate_bluestein_device_load_function(
+                    scheme, fuseBlue, direction, planar_blue_load, true));
+
+            auto planar_blue_store = array_type_is_planar(outArrayType);
+            bluestein_store = std::make_unique<Function>(generate_bluestein_device_store_function(
+                scheme, fuseBlue, direction, planar_blue_store, false));
+            bluestein_intrinsic_store
+                = std::make_unique<Function>(generate_bluestein_device_store_function(
+                    scheme, fuseBlue, direction, planar_blue_store, true));
+        }
+
+        global = std::make_unique<Function>(kernel->generate_global_function());
 
         // get factors vector
         all_factors = kernel->factors;
@@ -300,8 +364,12 @@
             *global = make_planar(*global, "buf");
     }
 
+    if(fuseBluestein)
+        *global = make_bluestein(scheme, fuseBlue, *global);
+
     // start off with includes
     std::string src;
+    src += rocfft_complex_h;
     src += common_h;
     src += memory_gfx_h;
     src += callback_h;
@@ -316,8 +384,6 @@
     if(scheme != CS_KERNEL_STOCKHAM_BLOCK_CC)
         src += real2complex_device_h;
 
-    src += rtc_workarounds_h;
-
     src += lds2reg->render();
     src += reg2lds->render();
     src += device->render();
@@ -327,6 +393,14 @@
         src += reg2lds1->render();
     if(device1)
         src += device1->render();
+    if(bluestein_load)
+        src += bluestein_load->render();
+    if(bluestein_intrinsic_load)
+        src += bluestein_intrinsic_load->render();
+    if(bluestein_store)
+        src += bluestein_store->render();
+    if(bluestein_intrinsic_store)
+        src += bluestein_intrinsic_store->render();
 
     // make_rtc removes templates from global function - add typedefs
     // and constants to replace them
diff -Nru rocfft-5.5.0/library/src/rtc_stockham_kernel.cpp rocfft-5.7.1/library/src/rtc_stockham_kernel.cpp
--- rocfft-5.5.0/library/src/rtc_stockham_kernel.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_stockham_kernel.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -32,8 +32,8 @@
                                                               const std::string& gpu_arch,
                                                               bool               enable_callbacks)
 {
-    RTCGenerator   generator;
-    function_pool& pool = function_pool::get_function_pool();
+    RTCStockhamGenerator generator;
+    function_pool&       pool = function_pool::get_function_pool();
 
     std::optional<StockhamGeneratorSpecs> specs;
     std::optional<StockhamGeneratorSpecs> specs2d;
@@ -41,7 +41,9 @@
     // if scale factor is enabled, we force RTC for this kernel
     bool enable_scaling = node.IsScalingEnabled();
 
-    SBRC_TRANSPOSE_TYPE transpose_type = NONE;
+    // for SBRC variants, sbrcTranstype should already be assigned by the time
+    // we get here, since KernelCheck() sets it
+    SBRC_TRANSPOSE_TYPE transpose_type = node.sbrcTranstype;
 
     // SBRC variants look in the function pool for plain BLOCK_RC to
     // learn the block width, then decide on the transpose type once
@@ -61,8 +63,12 @@
 
     std::optional<FFTKernel> kernel;
 
+    bool is_pre_compiled = false;
+
     // find function pool entry so we can construct specs for the generator
+    // NB: make sure all SBRC-type nodes have the correct trans_type value
     FMKey key;
+    key = node.GetKernelKey();
     switch(pool_scheme)
     {
     case CS_KERNEL_STOCKHAM:
@@ -70,25 +76,18 @@
     case CS_KERNEL_STOCKHAM_BLOCK_CR:
     case CS_KERNEL_STOCKHAM_BLOCK_RC:
     {
+        if((pool_scheme == CS_KERNEL_STOCKHAM_BLOCK_RC) && (transpose_type == NONE))
+            throw std::runtime_error("Invalid SBRC_TRANS_TYPE for SBRC kernel");
+
         // these go into the function pool normally and are passed to
         // the generator as-is
-        key    = fpkey(node.length[0], node.precision, pool_scheme);
         kernel = pool.get_kernel(key);
         // if a kernel is already precompiled, just use that.  but
         // changing largeTwdBatch transform count requires RTC, so we
         // can't use a precompiled kernel in that case.
         if(kernel->device_function && !enable_scaling && !node.largeTwdBatchIsTransformCount)
         {
-            return generator;
-        }
-
-        // for SBRC variants, get the "real" kernel using the block
-        // width and correct transpose type
-        if(pool_scheme == CS_KERNEL_STOCKHAM_BLOCK_RC)
-        {
-            transpose_type = node.sbrc_transpose_type(kernel->transforms_per_block);
-            key            = fpkey(node.length[0], node.precision, node.scheme, transpose_type);
-            kernel         = pool.get_kernel(key);
+            is_pre_compiled = true;
         }
 
         std::vector<unsigned int> factors;
@@ -107,12 +106,11 @@
     }
     case CS_KERNEL_2D_SINGLE:
     {
-        key    = fpkey(node.length[0], node.length[1], node.precision, node.scheme);
         kernel = pool.get_kernel(key);
         // already precompiled?
         if(kernel->device_function && !enable_scaling)
         {
-            return generator;
+            is_pre_compiled = true;
         }
 
         std::vector<unsigned int> factors1d;
@@ -153,6 +151,7 @@
     }
     default:
     {
+        // scheme is not handled here: return the generator as-is
         return generator;
     }
     }
@@ -166,13 +165,17 @@
         static_dim = 0;
     specs->static_dim = static_dim;
 
+    // mark wgs as derived already so generator won't change it again
+    specs->wgs_is_derived = true;
+    if(specs2d)
+        specs2d->wgs_is_derived = true;
+
     bool unit_stride = node.inStride.front() == 1 && node.outStride.front() == 1;
 
     generator.generate_name = [=, &node]() {
-        return stockham_rtc_kernel_name(node.scheme,
-                                        node.length[0],
-                                        node.scheme == CS_KERNEL_2D_SINGLE ? node.length[1] : 0,
-                                        static_dim,
+        return stockham_rtc_kernel_name(*specs,
+                                        specs2d ? *specs2d : *specs,
+                                        node.scheme,
                                         node.direction,
                                         node.precision,
                                         node.placement,
@@ -187,9 +190,14 @@
                                         node.intrinsicMode,
                                         transpose_type,
                                         enable_callbacks,
-                                        node.IsScalingEnabled());
+                                        node.IsScalingEnabled(),
+                                        node.fuseBlue);
     };
 
+    // if the kernel is pre-compiled, only the name-generating function is assigned
+    if(is_pre_compiled)
+        return generator;
+
     generator.generate_src = [=, &node](const std::string& kernel_name) {
         return stockham_rtc(*specs,
                             specs2d ? *specs2d : *specs,
@@ -210,7 +218,8 @@
                             node.intrinsicMode,
                             transpose_type,
                             enable_callbacks,
-                            node.IsScalingEnabled());
+                            node.IsScalingEnabled(),
+                            node.fuseBlue);
     };
 
     generator.construct_rtckernel
@@ -259,5 +268,81 @@
         if(array_type_is_planar(data.node->outArrayType))
             kargs.append_ptr(data.bufOut[1]);
     }
+    // fused bluestein data (chirp table and lengths)
+    switch(data.node->fuseBlue)
+    {
+    case BFT_NONE:
+        break;
+    case BFT_FWD_CHIRP:
+    case BFT_FWD_CHIRP_MUL:
+        if(data.node->scheme == CS_KERNEL_STOCKHAM_BLOCK_CC)
+            kargs.append_ptr(data.node->chirp);
+
+        kargs.append_size_t(data.node->lengthBlueN);
+        kargs.append_size_t(data.node->lengthBlue);
+
+        break;
+    case BFT_INV_CHIRP_MUL:
+        if(data.node->scheme == CS_KERNEL_STOCKHAM_BLOCK_RC)
+            kargs.append_ptr(data.node->chirp);
+
+        kargs.append_size_t(data.node->lengthBlueN);
+        kargs.append_size_t(data.node->lengthBlue);
+
+        break;
+    }
+    // fused bluestein data (strides and dists)
+    if(data.node->fuseBlue != BFT_NONE)
+    {
+        size_t empty_val = 0;
+
+        if(data.node->fuseBlue == BFT_FWD_CHIRP)
+        {
+            kargs.append_size_t(empty_val);
+            kargs.append_size_t(empty_val);
+            kargs.append_size_t(empty_val);
+
+            kargs.append_size_t(empty_val);
+            kargs.append_size_t(empty_val);
+            kargs.append_size_t(empty_val);
+        }
+        else
+        {
+            assert(data.node->inStrideBlue.size() == data.node->outStrideBlue.size());
+            switch(data.node->inStrideBlue.size())
+            {
+            case 2: // 1D FFT
+                kargs.append_size_t(empty_val);
+                kargs.append_size_t(empty_val);
+                kargs.append_size_t(data.node->iDistBlue);
+
+                kargs.append_size_t(empty_val);
+                kargs.append_size_t(empty_val);
+                kargs.append_size_t(data.node->oDistBlue);
+                break;
+            case 3: // 2D FFT
+                kargs.append_size_t(data.node->inStrideBlue[2]);
+                kargs.append_size_t(empty_val);
+                kargs.append_size_t(data.node->iDistBlue);
+
+                kargs.append_size_t(data.node->outStrideBlue[2]);
+                kargs.append_size_t(empty_val);
+                kargs.append_size_t(data.node->oDistBlue);
+                break;
+            case 4: // 3D FFT
+                kargs.append_size_t(data.node->inStrideBlue[2]);
+                kargs.append_size_t(data.node->inStrideBlue[3]);
+                kargs.append_size_t(data.node->iDistBlue);
+
+                kargs.append_size_t(data.node->outStrideBlue[2]);
+                kargs.append_size_t(data.node->outStrideBlue[3]);
+                kargs.append_size_t(data.node->oDistBlue);
+                break;
+            default:
+                throw std::runtime_error("Invalid strides for Bluestein kernel");
+            }
+        }
+    }
+
     return kargs;
 }
diff -Nru rocfft-5.5.0/library/src/rtc_subprocess.cpp rocfft-5.7.1/library/src/rtc_subprocess.cpp
--- rocfft-5.5.0/library/src/rtc_subprocess.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_subprocess.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -53,6 +53,10 @@
 typedef int        file_handle_type;
 #endif
 
+#define TO_STR2(x) #x
+#define TO_STR(x) TO_STR2(x)
+#define ROCFFT_VERSION_STRING TO_STR(ROCFFT_VERSION)
+
 static fs::path find_rtc_helper()
 {
     // candidate directories for the helper
@@ -69,9 +73,9 @@
         fs::path library_parent_path = library_path.parent_path();
         helper_dirs.push_back(library_parent_path);
 
-        // try bin dir, one dir up from library
-        fs::path bin_path = library_parent_path.parent_path() / "bin";
-        helper_dirs.push_back(bin_path);
+        // try in a versioned library subdirectory
+        fs::path subdir_path = library_parent_path / "rocfft" / ROCFFT_VERSION_STRING;
+        helper_dirs.push_back(subdir_path);
 
         // look for helper in the candidate directories
         for(const auto& dir : helper_dirs)
@@ -356,7 +360,6 @@
     pid_t pid = 0;
     char* argv[]
         = {const_cast<char*>(rtc_helper_exe.c_str()), const_cast<char*>(gpu_arch.c_str()), 0};
-    char* envp[] = {nullptr};
 
     // set up child's stdin/stdout
     posix_spawn_file_actions_t spawn_file_actions;
@@ -365,7 +368,7 @@
     posix_spawn_file_actions_adddup2(&spawn_file_actions, child_stdout_write, STDOUT_FILENO);
 
     int spawn_result
-        = posix_spawn(&pid, rtc_helper_exe.c_str(), &spawn_file_actions, nullptr, argv, envp);
+        = posix_spawn(&pid, rtc_helper_exe.c_str(), &spawn_file_actions, nullptr, argv, environ);
     posix_spawn_file_actions_destroy(&spawn_file_actions);
     if(spawn_result != 0)
     {
diff -Nru rocfft-5.5.0/library/src/rtc_transpose_gen.cpp rocfft-5.7.1/library/src/rtc_transpose_gen.cpp
--- rocfft-5.5.0/library/src/rtc_transpose_gen.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_transpose_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -80,6 +80,7 @@
     std::string src;
 
     // includes and declarations
+    src += rocfft_complex_h;
     src += common_h;
     src += callback_h;
 
diff -Nru rocfft-5.5.0/library/src/rtc_twiddle_gen.cpp rocfft-5.7.1/library/src/rtc_twiddle_gen.cpp
--- rocfft-5.5.0/library/src/rtc_twiddle_gen.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_twiddle_gen.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,209 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "rtc_twiddle_gen.h"
+#include "device/kernel-generator-embed.h"
+#include "rtc_kernel.h"
+#include "twiddles.h"
+
+std::string twiddle_rtc_kernel_name(TwiddleTableType type, rocfft_precision precision)
+{ // returns "twiddle_gen" + a table-type suffix + the precision suffix
+    std::string kernel_name = "twiddle_gen"; // common prefix for every twiddle kernel name
+    switch(type) // append the suffix identifying which table the kernel fills
+    {
+    case TwiddleTableType::RADICES:
+        kernel_name += "_radices";
+        break;
+    case TwiddleTableType::LENGTH_N:
+        kernel_name += "_N";
+        break;
+    case TwiddleTableType::HALF_N:
+        kernel_name += "_half_N";
+        break;
+    case TwiddleTableType::LARGE:
+        kernel_name += "_large";
+        break;
+    }
+    kernel_name += rtc_precision_name(precision); // precision suffix comes last
+    return kernel_name;
+}
+
+const char* twiddle_rtc_header = "extern \"C\" __global__ void "; // start of the generated kernel declaration
+
+static std::string twiddle_rtc_launch_bounds(TwiddleTableType type)
+{ // returns the "__launch_bounds__(N) " text for the given table type
+    std::string bounds = "__launch_bounds__(";
+    switch(type)
+    {
+    case TwiddleTableType::RADICES:
+    case TwiddleTableType::LARGE: // these kernel bodies index with both threadIdx.x and threadIdx.y
+        bounds += std::to_string(TWIDDLES_THREADS * TWIDDLES_THREADS);
+        break;
+    case TwiddleTableType::LENGTH_N:
+    case TwiddleTableType::HALF_N: // these kernel bodies index with threadIdx.x only
+        bounds += std::to_string(TWIDDLES_THREADS);
+        break;
+    }
+    bounds += ") "; // trailing space separates the attribute from the kernel name
+    return bounds;
+}
+
+static std::string twiddle_rtc_args(TwiddleTableType type, rocfft_precision precision)
+{ // NOTE(review): 'precision' is never read in this body
+    std::string args = "("; // parameter list is emitted as kernel source text
+    switch(type)
+    {
+    case TwiddleTableType::RADICES:
+        args += "size_t length_limit";
+        args += ", size_t num_radices";
+        args += ", radices_t radices"; // radices_t: presumably declared by radices_t_str - confirm
+        args += ", radices_t radices_prod";
+        args += ", radices_t radices_sum_prod";
+        args += ", scalar_type* output"; // scalar_type: presumably from rtc_precision_type_decl - confirm
+        break;
+    case TwiddleTableType::LENGTH_N:
+        args += "size_t length_limit";
+        args += ", size_t N";
+        args += ", scalar_type* output";
+        break;
+    case TwiddleTableType::HALF_N:
+        args += "size_t half_N";
+        args += ", size_t N";
+        args += ", scalar_type* output";
+        break;
+    case TwiddleTableType::LARGE:
+        args += "double phi";
+        args += ", size_t base";
+        args += ", size_t X";
+        args += ", size_t Y";
+        args += ", scalar_type* output";
+        break;
+    }
+    args += ")";
+    return args;
+}
+
+static std::string twiddle_rtc_body(TwiddleTableType type)
+{ // returns the brace-enclosed kernel body text for the given table type
+    std::string body = "{"; // opening brace of the kernel; closed after the switch
+    switch(type)
+    {
+    case TwiddleTableType::RADICES: // x index picks entry i of the radices arrays, y index is k
+        body += R"_SRC(
+        auto i = threadIdx.x + blockIdx.x * blockDim.x;
+
+        if(i < num_radices - 1)
+        {
+            auto L     = radices_prod.data[i];
+            auto radix = radices.data[i + 1];
+            auto k     = threadIdx.y + blockIdx.y * blockDim.y;
+
+            if(k < L / radix)
+            {
+                double theta = TWO_PI * (k) / (L);
+                auto   index = radices_sum_prod.data[i] + k * (radices.data[i + 1] - 1);
+
+                for(size_t j = 1; j < radix && index < length_limit; ++j)
+                {
+                    output[index].x = cos((j)*theta);
+                    output[index].y = sin((j)*theta);
+
+                    ++index;
+                }
+            }
+        }
+        )_SRC";
+        break;
+    case TwiddleTableType::LENGTH_N: // entry i gets cos/sin of TWO_PI * i / N, for i < N and i < length_limit
+        body += R"_SRC(
+        auto i = threadIdx.x + blockIdx.x * blockDim.x;
+
+        if(i < N && i < length_limit)
+        {
+            double c = cos(TWO_PI * i / N);
+            double s = sin(TWO_PI * i / N);
+
+            output[i].x = c;
+            output[i].y = s;
+        }
+        )_SRC";
+        break;
+    case TwiddleTableType::HALF_N: // entry i < half_N gets cos/sin of TWO_PI * i / (2 * N)
+        body += R"_SRC(
+        auto i = threadIdx.x + blockIdx.x * blockDim.x;
+
+        if(i < half_N)
+        {
+            double c = cos(TWO_PI * i / (2 * N));
+            double s = sin(TWO_PI * i / (2 * N));
+
+            output[i].x = c;
+            output[i].y = s;
+        }
+        )_SRC";
+        break;
+    case TwiddleTableType::LARGE: // entry (iY, iX) uses angle phi * ((1 << (iY * base)) * iX), stored at iY * X + iX
+        body += R"_SRC(
+        auto iY = threadIdx.y + blockIdx.y * blockDim.y;
+
+        if(iY < Y)
+        {
+            auto iX = threadIdx.x + blockIdx.x * blockDim.x;
+
+            if(iX < X)
+            {
+                auto j = (static_cast<size_t>(1) << (iY * base)) * iX;
+
+                double c = cos(phi * j);
+                double s = sin(phi * j);
+
+                auto index = iY * X + iX;
+
+                output[index].x = c;
+                output[index].y = s;
+            }
+        }
+        )_SRC";
+        break;
+    }
+    body += "}";
+    return body;
+}
+
+std::string
+    twiddle_rtc(const std::string& kernel_name, TwiddleTableType type, rocfft_precision precision)
+{ // assembles the full source text of one twiddle-generation kernel
+    std::string src;
+
+    src += rocfft_complex_h; // embedded header text goes first
+    src += common_h;
+    src += rtc_precision_type_decl(precision);
+    src += "static constexpr double TWO_PI = -6.283185307179586476925286766559;\n"; // NOTE: negative despite the name
+    src += "static const unsigned int TWIDDLES_MAX_RADICES = "
+           + std::to_string(TWIDDLES_MAX_RADICES) + ";\n";
+
+    src += radices_t_str;
+    src += twiddle_rtc_header; // declaration order: header, launch bounds, name, args, body
+    src += twiddle_rtc_launch_bounds(type);
+    src += kernel_name;
+    src += twiddle_rtc_args(type, precision);
+    src += twiddle_rtc_body(type);
+    return src;
+}
diff -Nru rocfft-5.5.0/library/src/rtc_twiddle_kernel.cpp rocfft-5.7.1/library/src/rtc_twiddle_kernel.cpp
--- rocfft-5.5.0/library/src/rtc_twiddle_kernel.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/rtc_twiddle_kernel.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,37 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "rtc_twiddle_kernel.h"
+#include "device/kernel-generator-embed.h"
+#include "rtc_cache.h"
+
+RTCKernelTwiddle RTCKernelTwiddle::generate(const std::string& gpu_arch,
+                                            TwiddleTableType   type,
+                                            rocfft_precision   precision)
+{ // builds the twiddle kernel for (type, precision) on gpu_arch via cached_compile
+    auto kernel_name = twiddle_rtc_kernel_name(type, precision);
+
+    kernel_src_gen_t generator{ // NOTE: the lambda's kernel_name parameter shadows the local above
+        [=](const std::string& kernel_name) { return twiddle_rtc(kernel_name, type, precision); }};
+
+    auto code = cached_compile(kernel_name, gpu_arch, generator, generator_sum()); // presumably compiles on a cache miss - confirm
+
+    return RTCKernelTwiddle{kernel_name, code, {}, {}};
+}
diff -Nru rocfft-5.5.0/library/src/solution_map.cpp rocfft-5.7.1/library/src/solution_map.cpp
--- rocfft-5.5.0/library/src/solution_map.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/solution_map.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,799 @@
+/******************************************************************************
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#include "solution_map.h"
+#include "../../shared/environment.h"
+#include "data_descriptor.h"
+#include "library_path.h"
+#include "logging.h"
+#include "node_factory.h"
+
+#include <fstream>
+
+namespace fs = std::filesystem;
+// A token is a maximal run of characters other than : ; , " { } [ and whitespace.
+#define REGEX "[^:;,\"\\{\\}\\[\\s]+"
+// File-name suffix; get_solution_map_path() puts "<arch>_" in front of it.
+static const char* def_solution_map_path = "rocfft_solution_map.dat";
+// VERSION is what write_solution_map_data() emits and what the non-converter read path requires.
+const int   solution_map::VERSION                       = 1;
+const char* solution_map::KERNEL_TOKEN_BUILTIN_KERNEL   = "kernel_token_builtin_kernel";
+const char* solution_map::LEAFNODE_TOKEN_BUILTIN_KERNEL = "leafnode_token_builtin_kernel";
+
+static std::map<SolutionNodeType, std::string> SolutionNodeTypetoStrMap()
+{
+    // each enum value maps to its own spelling, which is what the map text stores
+    return {{SOL_DUMMY, "SOL_DUMMY"},
+            {SOL_INTERNAL_NODE, "SOL_INTERNAL_NODE"},
+            {SOL_LEAF_NODE, "SOL_LEAF_NODE"},
+            {SOL_KERNEL_ONLY, "SOL_KERNEL_ONLY"},
+            {SOL_BUILTIN_KERNEL, "SOL_BUILTIN_KERNEL"}};
+}
+
+static std::map<std::string, SolutionNodeType> StrToSolutionNodeTypeMap()
+{
+    std::map<std::string, SolutionNodeType> inverse; // inverse of SolutionNodeTypetoStrMap()
+    for(const auto& [type, name] : SolutionNodeTypetoStrMap())
+        inverse.emplace(name, type);
+    return inverse;
+}
+
+std::string PrintSolutionNodeType(const SolutionNodeType snt)
+{
+    static const auto names = SolutionNodeTypetoStrMap(); // built once, on the first call
+    return names.at(snt); // at() throws std::out_of_range for a value missing from the table
+}
+
+SolutionNodeType StrToSolutionNodeType(const std::string& str)
+{
+    static const auto types = StrToSolutionNodeTypeMap(); // built once, on the first call
+    return types.at(str); // at() throws std::out_of_range for an unknown spelling
+}
+
+static fs::path get_solution_map_path(const std::string& read_folder,
+                                      const std::string& arch = "any")
+{
+    // Result: <read_folder>/<arch>_rocfft_solution_map.dat (existence is not checked here)
+    const std::string arch_prefix = arch + "_";
+    // path::operator+= appends text without inserting a directory separator
+    fs::path map_file(arch_prefix.c_str());
+    map_file += def_solution_map_path;
+
+    // path::operator/ joins folder and file name
+    const fs::path folder(read_folder.c_str());
+    return folder / map_file;
+}
+
+template <>
+struct ToString<SolutionPtr> // text form of a child reference
+{
+    std::string print(const SolutionPtr& value) const
+    {
+        std::string str = "{"; // braces around the two comma-separated fields
+        str += FieldDescriptor<std::string>().describe("child_token", value.child_token) + ",";
+        str += FieldDescriptor<size_t>().describe("child_option", value.child_option);
+        str += "}";
+        return str;
+    }
+};
+
+template <>
+struct FromString<SolutionPtr> // reads the fields in the order ToString<SolutionPtr> prints them
+{
+    void Get(SolutionPtr& ret, std::sregex_token_iterator& current) const
+    {
+        FieldParser<std::string>().parse("child_token", ret.child_token, current);
+        FieldParser<size_t>().parse("child_option", ret.child_option, current);
+    }
+};
+
+template <>
+struct ToString<SolutionNode>
+{
+    std::string print(const SolutionNode& value) const
+    {
+        std::string str = "{";
+        str += FieldDescriptor<std::string>().describe("sol_node_type",
+                                                       PrintSolutionNodeType(value.sol_node_type));
+        // sol_node_type is always written; which fields follow depends on the type
+        if(value.sol_node_type != SOL_BUILTIN_KERNEL) // built-in kernel: nothing else is written
+        {
+            str += ",";
+            if(value.sol_node_type == SOL_KERNEL_ONLY)
+            {
+                // kernel-only node: just the kernel key (the scheme is not written separately)
+                str += FieldDescriptor<FMKey>().describe("kernel_key", value.kernel_key);
+            }
+            else
+            {
+                // every other type: the scheme name, then the list of child references
+                str += FieldDescriptor<std::string>().describe("using_scheme",
+                                                               PrintScheme(value.using_scheme))
+                       + ",";
+                str += VectorFieldDescriptor<SolutionPtr>().describe("solution_childnodes",
+                                                                     value.solution_childnodes);
+            }
+        }
+
+        str += "}";
+        return str;
+    }
+};
+
+template <>
+struct FromString<SolutionNode> // mirrors the branches of ToString<SolutionNode>::print
+{
+    void Get(SolutionNode& ret, std::sregex_token_iterator& current) const
+    {
+        std::string sol_node_type_str;
+        std::string scheme_str;
+        // the node type comes first and decides which fields are expected next
+        FieldParser<std::string>().parse("sol_node_type", sol_node_type_str, current);
+        ret.sol_node_type = StrToSolutionNodeType(sol_node_type_str);
+        // for SOL_BUILTIN_KERNEL nothing further is read
+        if(ret.sol_node_type != SOL_BUILTIN_KERNEL)
+        {
+            if(ret.sol_node_type == SOL_KERNEL_ONLY)
+            {
+                FieldParser<FMKey>().parse("kernel_key", ret.kernel_key, current);
+                ret.using_scheme = std::get<2>(ret.kernel_key); // scheme is element 2 of the key
+            }
+            else
+            {
+                FieldParser<std::string>().parse("using_scheme", scheme_str, current);
+                ret.using_scheme = StrToComputeScheme(scheme_str);
+                // child references follow the scheme
+                VectorFieldParser<SolutionPtr>().parse(
+                    "solution_childnodes", ret.solution_childnodes, current);
+            }
+        }
+    }
+};
+
+template <>
+struct ToString<ProblemKey> // text form of a problem key: its arch and its token
+{
+    std::string print(const ProblemKey& value) const
+    {
+        std::string str = "{";
+        str += FieldDescriptor<std::string>().describe("arch", value.arch) + ",";
+        str += FieldDescriptor<std::string>().describe("token", value.probToken); // member probToken
+        str += "}";
+        return str;
+    }
+};
+
+template <>
+struct FromString<ProblemKey> // reads "arch" then "token", the order ToString<ProblemKey> prints
+{
+    void Get(ProblemKey& ret, std::sregex_token_iterator& current) const
+    {
+        std::string arch, token;
+        FieldParser<std::string>().parse("arch", arch, current);
+        FieldParser<std::string>().parse("token", token, current);
+        ret = {arch, token}; // rebuilt from the two parsed strings rather than filled in place
+    }
+};
+
+template <>
+struct ToString<SolMapEntry> // one map entry: the problem key (first) and its solutions (second)
+{
+    std::string print(const SolMapEntry& value) const
+    {
+        static const std::string blanks(14, ' '); // 14 spaces, handed to describe() below
+        // every entry starts on a new line
+        std::string str = "\n{";
+        str += FieldDescriptor<ProblemKey>().describe("Problem", value.first) + ",\n ";
+        // NOTE(review): 'true' + blanks presumably request one indented solution per line - confirm
+        str += VectorFieldDescriptor<SolutionNode>().describe(
+            "Solutions", value.second, true, blanks);
+        str += "}";
+        return str;
+    }
+};
+
+template <>
+struct FromString<SolMapEntry> // reads "Problem" into .first, then "Solutions" into .second
+{
+    void Get(SolMapEntry& ret, std::sregex_token_iterator& current) const
+    {
+        FieldParser<ProblemKey>().parse("Problem", ret.first, current);
+        VectorFieldParser<SolutionNode>().parse("Solutions", ret.second, current);
+    }
+};
+
+//////////////////////
+// Private Functions
+//////////////////////
+// private version called by constructor: plain append to the primary map, no duplicate check
+size_t solution_map::add_solution_private(const ProblemKey& probKey, const SolutionNode& solution)
+{
+    // operator[] creates an empty vector the first time a key is seen
+    auto& options = primary_sol_map[probKey];
+
+    // the new node goes to the back; its index is returned as the option id
+    options.push_back(solution);
+    const size_t option_id = options.size() - 1;
+    return option_id;
+}
+
+bool solution_map::SolutionNodesAreEqual(const SolutionNode& lhs,
+                                         const SolutionNode& rhs,
+                                         const std::string&  arch,
+                                         bool                primary_map)
+{
+    // Deep comparison: the nodes' own members must match, and so must every pair of
+    // corresponding children once resolved in the selected map.
+    if(std::tie(lhs.sol_node_type, lhs.using_scheme, lhs.kernel_key)
+       != std::tie(rhs.sol_node_type, rhs.using_scheme, rhs.kernel_key))
+        return false;
+
+    const size_t child_count = lhs.solution_childnodes.size();
+    if(child_count != rhs.solution_childnodes.size())
+        return false;
+
+    for(size_t idx = 0; idx < child_count; ++idx)
+    {
+        const auto& lp = lhs.solution_childnodes[idx];
+        const auto& rp = rhs.solution_childnodes[idx];
+        // a child is stored as (token, option); resolve both sides under the same arch.
+        // get_solution_node() throws when such a reference does not resolve.
+        const auto& l_child
+            = get_solution_node(ProblemKey(arch, lp.child_token), lp.child_option, primary_map);
+        const auto& r_child
+            = get_solution_node(ProblemKey(arch, rp.child_token), rp.child_option, primary_map);
+
+        if(!SolutionNodesAreEqual(l_child, r_child, arch, primary_map))
+            return false;
+    }
+
+    return true;
+}
+
+bool solution_map::remove_solution_bottom_up(SolutionNodeVec& nodeVec,
+                                             SolutionNode&    node,
+                                             size_t           pos)
+{
+    // Marks 'node' (= nodeVec[pos]) and, recursively, every parent referencing it for removal.
+    // A node reachable through several children/links must be processed only once, otherwise
+    // the child_option of the siblings behind it would be decremented more than once.
+    if(node.to_be_removed)
+        return true;
+    node.to_be_removed = true;
+
+    // Every element behind 'pos' moves one slot forward once this node is erased,
+    // so the option id stored in each SolutionPtr referring to them has to follow.
+    for(size_t i = pos + 1; i < nodeVec.size(); ++i)
+        for(auto& ptr : nodeVec[i].parent_sol_ptrs)
+            ptr->child_option -= 1;
+
+    // propagate upwards: every parent that references this node is removed as well
+    for(auto& parent : node.parent_sol_nodes)
+    {
+        // 'parent' points into *self_vec (see generate_link_info), so its index is the pointer
+        // offset; searching by value could hit an earlier, identical node instead.
+        SolutionNodeVec& vec = *(parent->self_vec);
+        remove_solution_bottom_up(vec, *parent, static_cast<size_t>(parent - vec.data()));
+    }
+
+    return true;
+}
+
+void solution_map::generate_link_info() // builds the back-links read by remove_solution_bottom_up
+{
+    for(auto& [key, value] : primary_sol_map)
+    {
+        SolutionNodeVec& solNodeVec = value;
+        for(SolutionNode& node : solNodeVec)
+        {
+            // every node remembers the vector it is stored in
+            node.self_vec = &solNodeVec;
+            // NOTE(review): links are only appended, never cleared; a second call would duplicate them
+            for(auto& child_node_ptr : node.solution_childnodes)
+            {
+                ProblemKey    pKey(key.arch, child_node_ptr.child_token);
+                SolutionNode& child = get_solution_node(pKey, child_node_ptr.child_option);
+
+                // the child records its parent node and the exact SolutionPtr referring to it
+                child.parent_sol_nodes.push_back(&node);
+                child.parent_sol_ptrs.push_back(&child_node_ptr);
+            }
+        }
+    }
+}
+
+//////////////////////
+// Public Functions
+//////////////////////
+void solution_map::setup()
+{
+    // Two environment variables control what gets loaded; the explicit file wins.
+    // 1) ROCFFT_READ_EXPLICIT_SOL_MAP_FILE: read exactly this file and stop.
+    const std::string explicit_file = rocfft_getenv("ROCFFT_READ_EXPLICIT_SOL_MAP_FILE");
+    if(!explicit_file.empty())
+    {
+        read_solution_map_data(fs::path(explicit_file.c_str()));
+        return;
+    }
+
+    // 2) ROCFFT_READ_SOL_MAP_FROM_FOLDER: enables reading solution map text files at runtime.
+    //    An empty value (the default) means nothing is read.
+    const std::string folder = rocfft_getenv("ROCFFT_READ_SOL_MAP_FROM_FOLDER");
+    if(folder.empty())
+        return;
+
+    // first the file for the default arch name ("any") ...
+    const fs::path any_arch_file = get_solution_map_path(folder);
+    read_solution_map_data(any_arch_file);
+
+    // ... then the file for the arch of the current device
+    auto           deviceProp = get_curr_device_prop();
+    const fs::path arch_file  = get_solution_map_path(folder, get_arch_name(deviceProp));
+    read_solution_map_data(arch_file);
+}
+
+bool solution_map::has_solution_node(const ProblemKey& probKey, size_t option_id, bool primary_map)
+{
+    // primary_map selects which of the two maps is searched
+    ProbSolMap& searched = (primary_map) ? primary_sol_map : temp_working_map;
+    // unknown key: there is no option at all
+    const auto entry = searched.find(probKey);
+    if(entry == searched.end())
+        return false;
+
+    // known key: option_id must be a valid index into its solution vector
+    return option_id < entry->second.size();
+}
+
+SolutionNode&
+    solution_map::get_solution_node(const ProblemKey& probKey, size_t option_id, bool primary_map)
+{
+    // validate first, so that a bad (key, option) pair yields a descriptive exception
+    const bool found = has_solution_node(probKey, option_id, primary_map);
+    if(!found)
+        throw std::runtime_error(
+            "get_solution_node failed. the solution_node doesn't exist: ProbKey=(" + probKey.arch
+            + "," + probKey.probToken + "), option_id=" + std::to_string(option_id));
+
+    // key and index were both verified by the check above
+    ProbSolMap& searched = (primary_map) ? primary_sol_map : temp_working_map;
+    return searched.at(probKey)[option_id];
+}
+
+FMKey&
+    solution_map::get_solution_kernel(const ProblemKey& probKey, size_t option_id, bool primary_map)
+{
+    // Returns the kernel_key of option 'option_id' of 'probKey' in the selected map.
+    // Callers are expected to have checked has_solution_node(); should they not have, both
+    // lookups are bounds-checked and throw std::out_of_range rather than reading out of bounds.
+    ProbSolMap& dst_map = (primary_map) ? primary_sol_map : temp_working_map;
+    return dst_map.at(probKey).at(option_id).kernel_key;
+}
+
+// builds the solution of a problem from a tree node and inserts it; meant for a benchmarker
+size_t solution_map::add_solution(const ProblemKey&               probKey,
+                                  TreeNode*                       currentNode,
+                                  const std::vector<SolutionPtr>& children,
+                                  bool                            isRootProb,
+                                  bool                            check_dup,
+                                  bool                            primary_map)
+{
+    const bool is_leaf = (currentNode->nodeType == NT_LEAF);
+    SolutionNode node;
+    node.sol_node_type       = is_leaf ? SOL_LEAF_NODE : SOL_INTERNAL_NODE;
+    node.using_scheme        = currentNode->scheme;
+    node.solution_childnodes = children;
+    return add_solution(probKey, node, isRootProb, check_dup, primary_map);
+}
+
+// inserts a kernel solution (EmptyFMKey = built-in kernel, no scheme); meant for a benchmarker
+size_t solution_map::add_solution(const ProblemKey& probKey,
+                                  const FMKey&      kernel_key,
+                                  bool              check_dup,
+                                  bool              primary_map)
+{
+    const bool builtin = (kernel_key == EmptyFMKey);
+    SolutionNode node;
+    node.kernel_key    = kernel_key;
+    node.sol_node_type = builtin ? SOL_BUILTIN_KERNEL : SOL_KERNEL_ONLY;
+    node.using_scheme  = builtin ? CS_NONE : std::get<2>(kernel_key);
+    return add_solution(probKey, node, false, check_dup, primary_map);
+}
+
+// directly insert a solution of a problem into the map; returns its option id (benchmarker use)
+size_t solution_map::add_solution(const ProblemKey&   probKey,
+                                  const SolutionNode& solution,
+                                  bool                isRootProb,
+                                  bool                check_dup,
+                                  bool                primary_map)
+{
+    ProbSolMap& dst_map = (primary_map) ? primary_sol_map : temp_working_map;
+
+    // CS_KERNEL_STOCKHAM can be a problem scheme but also a kernel, hence the node-type test.
+    bool is_problem_scheme = ComputeSchemeIsAProblem(solution.using_scheme)
+                             && (solution.sol_node_type != SOL_KERNEL_ONLY);
+
+    // key not present yet: create its (empty) solution vector
+    if(dst_map.count(probKey) == 0)
+    {
+        dst_map.emplace(probKey, SolutionNodeVec());
+
+        // a vector holding problem solutions (not kernels / non-problems) starts with a
+        // dummy node: slot 0 is reserved for the root-problem solution
+        if(is_problem_scheme)
+            dst_map.at(probKey).push_back(SolutionNode::DummySolutionNode());
+    }
+
+    auto& sol_vec = dst_map.at(probKey);
+
+    // Case 1: root solution.
+    // It never checks duplication since it always lives in the first element,
+    // so adding a root solution simply overwrites slot 0.
+    if(isRootProb)
+    {
+        // sanity check only; NOTE(review): compiled out with NDEBUG, slot 0 is then unchecked
+        assert(is_problem_scheme && (sol_vec.size() > 0));
+        sol_vec[0] = solution;
+        return 0;
+    }
+    else if(check_dup)
+    {
+        // Case 2: look for an identical existing solution.
+        // Vectors of problem solutions reserve slot 0 (dummy / exclusive root solution), so
+        // the scan starts at option 1 for them and at option 0 for everything else.
+        const std::string& arch            = probKey.arch;
+        size_t             check_option_id = (is_problem_scheme) ? 1 : 0;
+        for(; check_option_id < sol_vec.size(); ++check_option_id)
+        {
+            // an identical solution already exists: don't insert, return that option
+            if(SolutionNodesAreEqual(solution, sol_vec[check_option_id], arch, primary_map))
+                return check_option_id;
+        }
+    }
+
+    // Case 3: either check_dup found nothing identical,
+    // or duplicates are allowed on purpose (check_dup == false, used for tuning)
+    sol_vec.push_back(solution);
+
+    return sol_vec.size() - 1;
+}
+
+// parse the format version of the input file into self_version; called by the converter
+bool solution_map::get_solution_map_version(const fs::path& sol_map_in_path)
+{
+    static std::regex regEx(REGEX, std::regex_constants::optimize);
+
+    if(LOG_TRACE_ENABLED())
+        (*LogSingleton::GetInstance().GetTraceOS())
+            << "reading solution map data from: " << sol_map_in_path.c_str() << std::endl;
+
+    // a missing file is the only failure that is reported through the return value
+    if(!fs::exists(sol_map_in_path))
+        return false;
+
+    // Only the first line mentioning "Version" is looked at. If no line does, self_version
+    // keeps its previous value and the call still succeeds.
+    std::ifstream in_file(sol_map_in_path.c_str());
+    for(std::string line; std::getline(in_file, line);)
+    {
+        if(line.find("Version") == std::string::npos)
+            continue;
+
+        // tokenize just this line and let the field parser extract the integer
+        std::sregex_token_iterator tokens{line.begin(), line.end(), regEx, 0};
+        FieldParser<int>().parse("Version", self_version, tokens);
+        break;
+    }
+
+    return true;
+}
+
+// Reads a solution map text file into the primary (or temporary) map; false if nothing was read
+bool solution_map::read_solution_map_data(const fs::path& sol_map_in_path, bool primary_map)
+{
+    static std::regex regEx(REGEX, std::regex_constants::optimize);
+
+    if(LOG_TRACE_ENABLED())
+        (*LogSingleton::GetInstance().GetTraceOS())
+            << "reading solution map data from: " << sol_map_in_path.c_str() << std::endl;
+
+    // Read the whole file as one string (line breaks dropped). A missing file leaves it empty.
+    std::string solution_map_text = "";
+    if(fs::exists(sol_map_in_path))
+    {
+        std::ifstream in_file(sol_map_in_path.c_str());
+        std::string   line;
+
+        while(std::getline(in_file, line))
+        {
+            solution_map_text += line;
+        }
+    }
+    // drop one trailing ']'. The text can be empty here, and back() on an empty string is UB.
+    if(!solution_map_text.empty() && solution_map_text.back() == ']')
+        solution_map_text.resize(solution_map_text.size() - 1);
+
+    ProbSolMap& dst_map = (primary_map) ? primary_sol_map : temp_working_map;
+
+    std::sregex_token_iterator tokens{solution_map_text.begin(), solution_map_text.end(), regEx, 0};
+    std::sregex_token_iterator endIt;
+    if(tokens == endIt)
+    {
+        if(LOG_TRACE_ENABLED())
+            (*LogSingleton::GetInstance().GetTraceOS())
+                << "\tfile not found or file is empty" << std::endl;
+        return false;
+    }
+
+    // every caller except the converter expects the latest format version
+    if(assume_latest_ver)
+    {
+        try
+        {
+            FieldParser<int>().parse("Version", self_version, tokens);
+            if(self_version != solution_map::VERSION)
+                throw std::runtime_error("format version of the input file is not the latest, "
+                                         "please execute the solution map converter first.");
+
+            // always do the latest version reading here; keys already in the map are kept
+            std::vector<SolMapEntry> entry_vec;
+            VectorFieldParser<SolMapEntry>().parse("Data", entry_vec, tokens);
+            for(auto& entry : entry_vec)
+                dst_map.emplace(entry.first, entry.second);
+        }
+        catch(const std::exception& e)
+        {
+            std::cerr << e.what() << std::endl;
+            return false;
+        }
+
+        return true;
+    }
+
+    /* Only the converter reaches here, and self_version was already parsed outside. The latest
+       reading always lives in the (assume_latest_ver) block; older readings are kept below. */
+
+    // reading the oldest format: it carries no version number, i.e. version 0
+    if(self_version == 0)
+    {
+        for(; tokens != endIt; ++tokens)
+        {
+            ProblemKey                probKey;
+            std::vector<SolutionNode> solutionVec;
+            FieldParser<ProblemKey>().parse("Problem", probKey, tokens);
+            VectorFieldParser<SolutionNode>().parse("Solutions", solutionVec, tokens);
+            dst_map.emplace(probKey, solutionVec);
+        }
+    }
+    // readers for other (future) versions go here:
+    // else if() { ... }
+    // else if() { ... }
+
+    return true;
+}
+
+// serializes the selected map to text and overwrites sol_map_out_path with it
+bool solution_map::write_solution_map_data(const fs::path& sol_map_out_path,
+                                           bool            sort,
+                                           bool            primary_map)
+{
+    if(LOG_TUNING_ENABLED())
+        (*LogSingleton::GetInstance().GetTuningOS())
+            << "writing solution map data to: " << sol_map_out_path.c_str() << std::endl;
+
+    // open (and truncate) the destination first; failing to do so throws
+    std::ofstream out_stream(sol_map_out_path.c_str(), (std::ios::out | std::ios::trunc));
+    if(!out_stream.is_open())
+        throw std::runtime_error("Write solution map failed. Cannot open/create output file: "
+                                 + sol_map_out_path.string());
+
+    // flatten the chosen map into a vector of (problem key, solutions) entries
+    ProbSolMap&              source_map = (primary_map) ? primary_sol_map : temp_working_map;
+    std::vector<SolMapEntry> entries;
+    for(auto& [prob_key, solutions] : source_map)
+        entries.emplace_back(prob_key, solutions);
+
+    // optional ordering by ProbSolCmp
+    if(sort)
+        std::sort(entries.begin(), entries.end(), ProbSolCmp);
+
+    // compose the text in memory: an opening brace, the Version field, the Data vector,
+    // and a closing brace
+    std::stringstream text;
+    text << "{" << FieldDescriptor<int>().describe("Version", solution_map::VERSION) << ","
+         << std::endl;
+    text << VectorFieldDescriptor<SolMapEntry>().describe("Data", entries) << std::endl << "}";
+
+    out_stream << text.str();
+    out_stream.close();
+
+    return true;
+}
+
+bool solution_map::merge_solutions_from_file(const fs::path&                src_file,
+                                             const std::vector<ProblemKey>& root_probs)
+{
+    bool check_dup       = true;
+    bool read_to_primary = false;
+    if(read_solution_map_data(src_file, read_to_primary) == false)
+        return false;
+
+    // Important: the second argument must NOT be a SolutionNode& (alias), because a node
+    // may be shared with others (e.g. RTRT, where the 2 Rs share the same one).
+    // This recursive add copies a sub-tree from mapA to mapB, so it does not just move nodes
+    // but also rewrites the "option_id" of each node's children (the children are added to
+    // mapB as well). Rewriting the option_id through a reference would also change the
+    // data of the node that stays behind in mapA. Passing by copy
+    // is therefore the safe choice here.
+    auto RecursivelyAddSolution = [&](const ProblemKey& key,
+                                      SolutionNode      solution,
+                                      bool              isRoot,
+                                      bool              from_primary,
+                                      bool              to_primary,
+                                      auto&&            RecursivelyAddSolution) -> size_t {
+        std::string archName = key.arch;
+        for(auto& child : solution.solution_childnodes)
+        {
+            ProblemKey childKey(archName, child.child_token);
+
+            // fetch the child solution from the source map (using the existing child_option)
+            auto& childSol = get_solution_node(childKey, child.child_option, from_primary);
+
+            // the child lands in another map, so the stored option must become its new index
+            child.child_option = RecursivelyAddSolution(
+                childKey, childSol, false, from_primary, to_primary, RecursivelyAddSolution);
+        }
+        return add_solution(key, solution, isRoot, check_dup, to_primary);
+    };
+
+    // for each root problem, add the whole solution tree from one map to the other
+    for(auto& rootProbKey : root_probs)
+    {
+        bool isRootProb     = true;
+        bool getFromPrimary = read_to_primary;
+        bool addToPrimary   = !getFromPrimary;
+
+        SolutionNode& sol_node = get_solution_node(rootProbKey, 0, getFromPrimary);
+
+        size_t option_id = RecursivelyAddSolution(rootProbKey,
+                                                  sol_node,
+                                                  isRootProb,
+                                                  getFromPrimary,
+                                                  addToPrimary,
+                                                  RecursivelyAddSolution);
+
+        // a root-problem solution must always end up at index 0 of its vector
+        if(option_id != 0)
+            throw std::runtime_error(
+                "Merge failed. Inserting solution of a root-problem should return an index 0");
+    }
+
+    return true;
+}
+
+//////////////////////
+// Version Converter
+//////////////////////
+bool SolutionMapConverter::remove_invalid_half_lds() // version-0 cleanup step of the converter
+{
+    static const std::set<ComputeScheme> no_half_lds
+        = {CS_KERNEL_STOCKHAM_BLOCK_CR,
+           CS_KERNEL_STOCKHAM_BLOCK_RC,
+           CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z,
+           CS_KERNEL_STOCKHAM_TRANSPOSE_Z_XY,
+           CS_KERNEL_STOCKHAM_R_TO_CMPLX_TRANSPOSE_Z_XY};
+    // a kernel key with half_lds set for one of the schemes above is treated as invalid
+    auto& sol_map = solution_map::get_solution_map();
+
+    // generate the parents' info (back references) that the bottom-up removal walks
+    sol_map.generate_link_info();
+
+    // pass 1: mark the invalid kernel nodes (and, through the back references, their parents)
+    for(auto& [key, value] : sol_map.primary_sol_map)
+    {
+        SolutionNodeVec& solNodeVec = value;
+        for(size_t i = 0; i < solNodeVec.size(); ++i)
+        {
+            SolutionNode& node = solNodeVec[i];
+            if(node.sol_node_type == SOL_KERNEL_ONLY)
+            {
+                ComputeScheme scheme = std::get<2>(node.kernel_key);
+                KernelConfig& config = std::get<4>(node.kernel_key);
+                if(config.half_lds && no_half_lds.count(scheme))
+                {
+                    sol_map.remove_solution_bottom_up(solNodeVec, node, i);
+                }
+            }
+            else
+            {
+                break; // NOTE(review): presumably kernel and other nodes never share a vector
+            }
+        }
+    }
+
+    // pass 2: erase the marked nodes, and collect the keys whose vector became empty
+    std::set<ProblemKey> to_be_removed_keys;
+    for(auto& [key, value] : sol_map.primary_sol_map)
+    {
+        SolutionNodeVec& solNodeVec = value;
+        for(size_t i = 0; i < solNodeVec.size(); ++i)
+        {
+            SolutionNode& node = solNodeVec[i];
+            if(node.to_be_removed)
+            {
+                solNodeVec.erase(solNodeVec.begin() + i);
+                --i; // stay at the same position; the unsigned wrap at i == 0 is undone by ++i
+            }
+        }
+        if(solNodeVec.empty())
+            to_be_removed_keys.insert(key);
+    }
+
+    // pass 3: remove the entries left with a zero-length vector (not done while iterating)
+    for(const auto& key : to_be_removed_keys)
+        sol_map.primary_sol_map.erase(key);
+
+    return true;
+}
+
+bool SolutionMapConverter::VersionCheckAndConvert(const std::string& in_map_path,
+                                                  const std::string& out_map_path)
+{
+    // Converts the solution map file at in_map_path to the latest format and, if a conversion
+    // was needed, writes it to out_map_path. Returns false on failure (exceptions go to cerr).
+    auto& sol_map = solution_map::get_solution_map();
+
+    try
+    {
+        // don't assert latest version, so that read_solution_map_data() accepts any version
+        sol_map.assume_latest_ver = false; // NOTE(review): never restored afterwards
+
+        // parse the version individually first (reports failure only for a missing file)
+        if(sol_map.get_solution_map_version(in_map_path) == false)
+            return false;
+
+        // the read function reads the file according to the self_version parsed above
+        if(sol_map.read_solution_map_data(in_map_path) == false)
+            return false;
+
+        bool has_conversion = sol_map.self_version != solution_map::VERSION;
+
+        // actions that bring older data up to the latest version; add further steps here
+        if(sol_map.self_version == 0)
+            remove_invalid_half_lds();
+
+        if(has_conversion)
+        {
+            // write first: success must not be reported when the output can't be produced
+            if(sol_map.write_solution_map_data(out_map_path) == false)
+                return false;
+            std::cout << "successfully converted solution map from version(" << sol_map.self_version
+                      << ") to latest version(" << solution_map::VERSION << ").\n";
+        }
+        else
+            std::cout << "solution map is already at the latest version.\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << e.what() << std::endl;
+        return false;
+    }
+
+    return true;
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/transform.cpp rocfft-5.7.1/library/src/transform.cpp
--- rocfft-5.5.0/library/src/transform.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/transform.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
diff -Nru rocfft-5.5.0/library/src/tree_node.cpp rocfft-5.7.1/library/src/tree_node.cpp
--- rocfft-5.5.0/library/src/tree_node.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/tree_node.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -19,6 +19,7 @@
 // THE SOFTWARE.
 
 #include "tree_node.h"
+#include "../../shared/precision_type.h"
 #include "function_pool.h"
 #include "kernel_launch.h"
 #include "logging.h"
@@ -40,6 +41,11 @@
         Repo::ReleaseTwiddle1D(twiddles_large);
         twiddles_large = nullptr;
     }
+    if(chirp)
+    {
+        Repo::ReleaseChirp(chirp);
+        chirp = nullptr;
+    }
 }
 
 NodeMetaData::NodeMetaData(TreeNode* refNode)
@@ -58,8 +64,8 @@
 {
     if(large1D != 0)
     {
-        std::tie(twiddles_large, twiddles_large_size)
-            = Repo::GetTwiddles1D(large1D, 0, precision, largeTwdBase, false, {});
+        std::tie(twiddles_large, twiddles_large_size) = Repo::GetTwiddles1D(
+            large1D, 0, precision, deviceProp.gcnArchName, largeTwdBase, false, {});
     }
 
     return true;
@@ -72,26 +78,73 @@
     return length[0];
 }
 
-void LeafNode::GetKernelFactors()
+FMKey LeafNode::GetKernelKey() const
 {
-    ComputeScheme _scheme = scheme;
-    if(_scheme == CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z || _scheme == CS_KERNEL_STOCKHAM_TRANSPOSE_Z_XY
-       || _scheme == CS_KERNEL_STOCKHAM_R_TO_CMPLX_TRANSPOSE_Z_XY)
-        _scheme = CS_KERNEL_STOCKHAM_BLOCK_RC;
+    if(!externalKernel)
+        return EmptyFMKey;
+
+    return TreeNode::GetKernelKey();
+}
 
-    FMKey key     = (dimension == 1) ? fpkey(length[0], precision, _scheme)
-                                     : fpkey(length[0], length[1], precision, _scheme);
+void LeafNode::GetKernelFactors()
+{
+    FMKey key     = GetKernelKey();
     kernelFactors = function_pool::get_kernel(key).factors;
 }
 
-bool LeafNode::KernelCheck()
+bool LeafNode::KernelCheck(std::vector<FMKey>& kernel_keys)
 {
     if(!externalKernel)
+    {
+        // For solution kernels such as 2D_RTRT or 1D_CRT, the "T" kernel is not an external one,
+        // so in the solution map we keep it as an empty key. By storing and checking the empty key,
+        // we can increase the reliability of the solution map.
+        if(!kernel_keys.empty())
+        {
+            if(LOG_TRACE_ENABLED())
+                (*LogSingleton::GetInstance().GetTraceOS())
+                    << "solution kernel is an built-in kernel" << std::endl;
+
+            // kernel_key from solution map should be an EmptyFMKey for a built-in kernel
+            if(kernel_keys.front() != EmptyFMKey)
+                return false;
+            kernel_keys.erase(kernel_keys.begin());
+        }
         return true;
+    }
+
+    specified_key = nullptr;
+    if(!kernel_keys.empty())
+    {
+        FMKey assignedKey = kernel_keys.front();
+        kernel_keys.erase(kernel_keys.begin());
+
+        // check if the assigned key is consistent with the node information
+        const auto&            key_lengths   = std::get<0>(assignedKey);
+        const rocfft_precision key_precision = std::get<1>(assignedKey);
+        const ComputeScheme    key_scheme    = std::get<2>(assignedKey);
+        if((length[0] != key_lengths[0]) || (dimension == 2 && length[1] != key_lengths[1])
+           || (precision != key_precision) || (scheme != key_scheme))
+        {
+            if(LOG_TRACE_ENABLED())
+                (*LogSingleton::GetInstance().GetTraceOS())
+                    << "solution kernel keys are invalid" << std::endl;
+            return false;
+        }
+        else
+        {
+            // get the sbrc_trans_type from assignedKey (for sbrc)
+            sbrcTranstype = std::get<3>(assignedKey);
 
-    // check we have the kernel
-    FMKey key = (dimension == 1) ? fpkey(length[0], precision, scheme)
-                                 : fpkey(length[0], length[1], precision, scheme);
+            function_pool::add_new_kernel(assignedKey);
+            specified_key = std::make_unique<FMKey>(assignedKey);
+        }
+    }
+
+    // get the final key and check if we have the kernel.
+    // Note that the check is trivial if we are using "specified_key"
+    // since we definitely have the kernel, but not trivial if it's the auto-gen key
+    FMKey key = GetKernelKey();
     if(!function_pool::has_function(key))
     {
         if(LOG_TRACE_ENABLED())
@@ -108,12 +161,32 @@
     return true;
 }
 
-void LeafNode::SanityCheck()
+void LeafNode::SanityCheck(SchemeTree* solution_scheme, std::vector<FMKey>& kernels_keys)
 {
-    if(!KernelCheck())
-        throw std::runtime_error("Kernel not found");
+    if(!KernelCheck(kernels_keys))
+        throw std::runtime_error("Kernel not found or mismatches node (solution map issue)");
+
+    TreeNode::SanityCheck(solution_scheme, kernels_keys);
+}
 
-    TreeNode::SanityCheck();
+void LeafNode::Print(rocfft_ostream& os, int indent) const
+{
+    TreeNode::Print(os, indent);
+
+    std::string indentStr;
+    while(indent--)
+        indentStr += "    ";
+
+    os << indentStr.c_str() << "Leaf-Node: external-kernel configuration: ";
+    indentStr += "    ";
+    os << "\n" << indentStr.c_str() << "workgroup_size: " << wgs;
+    os << "\n" << indentStr.c_str() << "trans_per_block: " << bwd;
+    os << "\n" << indentStr.c_str() << "radices: [ ";
+    for(size_t i = 0; i < kernelFactors.size(); i++)
+    {
+        os << kernelFactors[i] << " ";
+    }
+    os << "]\n";
 }
 
 bool LeafNode::CreateDevKernelArgs()
@@ -122,15 +195,26 @@
     return (devKernArg != nullptr);
 }
 
-bool LeafNode::CreateTwiddleTableResource()
+bool LeafNode::CreateDeviceResources()
 {
+    if(need_chirp)
+    {
+        std::tie(chirp, chirp_size)
+            = Repo::GetChirp(lengthBlueN, precision, deviceProp.gcnArchName);
+    }
+
     if(need_twd_table)
     {
         if(!twd_no_radices)
             GetKernelFactors();
         size_t twd_len                    = GetTwiddleTableLength();
-        std::tie(twiddles, twiddles_size) = Repo::GetTwiddles1D(
-            twd_len, GetTwiddleTableLengthLimit(), precision, 0, twd_attach_halfN, kernelFactors);
+        std::tie(twiddles, twiddles_size) = Repo::GetTwiddles1D(twd_len,
+                                                                GetTwiddleTableLengthLimit(),
+                                                                precision,
+                                                                deviceProp.gcnArchName,
+                                                                0,
+                                                                twd_attach_halfN,
+                                                                kernelFactors);
     }
 
     return CreateLargeTwdTable();
@@ -141,11 +225,12 @@
     // derived classes setup the gp (bwd, wgs, lds, padding), funPtr
     SetupGPAndFnPtr_internal(fnPtr, gp);
 
+    auto key = GetKernelKey();
+
     // common: sum up the value;
-    gp.lds_bytes = (lds + lds_padding * bwd) * sizeof_precision(precision);
+    gp.lds_bytes = lds * complex_type_size(precision);
     if(scheme == CS_KERNEL_STOCKHAM && ebtype == EmbeddedType::NONE)
     {
-        auto key = fpkey(length[0], precision, scheme);
         if(function_pool::has_function(key))
         {
             auto kernel = function_pool::get_kernel(key);
@@ -167,10 +252,10 @@
                 gp.lds_bytes /= 2;
         }
     }
-    if((scheme == CS_KERNEL_STOCKHAM_BLOCK_CC || scheme == CS_KERNEL_STOCKHAM_BLOCK_CR)
+    // SBCC supports half-lds conditionally
+    if((scheme == CS_KERNEL_STOCKHAM_BLOCK_CC)
        && (dir2regMode == DirectRegType::TRY_ENABLE_IF_SUPPORT) && (ebtype == EmbeddedType::NONE))
     {
-        auto key = fpkey(length[0], precision, scheme);
         if(function_pool::has_function(key))
         {
             auto kernel = function_pool::get_kernel(key);
@@ -178,7 +263,16 @@
                 gp.lds_bytes /= 2;
         }
     }
-    return;
+    // NB:
+    //   SBCR / SBRC are not able to use half-lds because neither of them can satisfy dir-to/from-registers at the same time.
+
+    // Confirm that the requested LDS bytes will fit into what the
+    // device can provide.  If it can't, we've made a mistake in our
+    // computation somewhere.
+    if(gp.lds_bytes > deviceProp.sharedMemPerBlock)
+        throw std::runtime_error(std::to_string(gp.lds_bytes)
+                                 + " bytes of LDS requested, but device only provides "
+                                 + std::to_string(deviceProp.sharedMemPerBlock));
 }
 
 /*****************************************************
@@ -298,16 +392,25 @@
 
 bool TreeNode::IsBluesteinChirpSetup()
 {
-    // setup nodes must be under a bluestein parent
-    if(parent == nullptr || parent->scheme != CS_BLUESTEIN)
+    // setup nodes must be under a bluestein parent. multi-kernel fused
+    // bluestein is an exception to this rule as the first two chirp + padding
+    // nodes are under an L1D_CC node.
+    if(typeBlue != BT_MULTI_KERNEL_FUSED && (parent == nullptr || parent->scheme != CS_BLUESTEIN))
+        return false;
+    // bluestein could either be 3-kernel plan (so-called single kernel Bluestein),
+    // meaning the first two are setup kernels, or multi-kernel bluestein (fused or non-fused)
+    // where only the first is setup
+    switch(parent->typeBlue)
+    {
+    case BluesteinType::BT_NONE:
         return false;
-    // bluestein could either be 3-kernel plan, meaning the first two
-    // are setup kernels, or a 6 kernel plan where only the first is
-    // setup
-    if(parent->childNodes.size() == 3)
+    case BluesteinType::BT_SINGLE_KERNEL:
         return this == parent->childNodes[0].get() || this == parent->childNodes[1].get();
-    else if(parent->childNodes.size() == 6)
+    case BluesteinType::BT_MULTI_KERNEL:
         return this == parent->childNodes[0].get();
+    case BluesteinType::BT_MULTI_KERNEL_FUSED:
+        return (fuseBlue == BFT_FWD_CHIRP) ? true : false;
+    }
 
     throw std::runtime_error("unexpected bluestein plan shape");
 }
diff -Nru rocfft-5.5.0/library/src/tree_node_1D.cpp rocfft-5.7.1/library/src/tree_node_1D.cpp
--- rocfft-5.5.0/library/src/tree_node_1D.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/tree_node_1D.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -19,17 +19,21 @@
 // THE SOFTWARE.
 
 #include "tree_node_1D.h"
+#include "../../shared/precision_type.h"
 #include "../device/kernels/bank_shift.h"
 #include "function_pool.h"
 #include "fuse_shim.h"
 #include "node_factory.h"
+#include "tuning_helper.h"
 #include <numeric>
 
 /*****************************************************
  * L1D_TRTRT  *
  *****************************************************/
-void TRTRT1DNode::BuildTree_internal()
+void TRTRT1DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     size_t lenFactor1 = length.back();
     size_t lenFactor0 = length[0] / lenFactor1;
     if(lenFactor0 * lenFactor1 != length[0])
@@ -37,6 +41,8 @@
     length.pop_back();
 
     // first transpose
+    if(!noSolution)
+        assert(child_schemes[0] == CS_KERNEL_TRANSPOSE);
     auto trans1Plan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE, this);
     trans1Plan->length.push_back(lenFactor0);
     trans1Plan->length.push_back(lenFactor1);
@@ -56,11 +62,16 @@
     {
         row1PlanData.length.push_back(length[index]);
     }
-    auto row1Plan     = NodeFactory::CreateExplicitNode(row1PlanData, this);
-    row1Plan->large1D = 0;
+
+    // skip the decide scheme part in node factory
+    ComputeScheme determined_scheme = (noSolution) ? CS_NONE : child_schemes[1];
+    auto          row1Plan = NodeFactory::CreateExplicitNode(row1PlanData, this, determined_scheme);
+    row1Plan->large1D      = 0;
     row1Plan->RecursiveBuildTree();
 
     // second transpose
+    if(!noSolution)
+        assert(child_schemes[2] == CS_KERNEL_TRANSPOSE);
     auto trans2Plan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE, this);
     trans2Plan->length.push_back(lenFactor1);
     trans2Plan->length.push_back(lenFactor0);
@@ -73,6 +84,8 @@
     trans2Plan->SetTransposeOutputLength();
 
     // second row fft
+    if(!noSolution)
+        assert(child_schemes[3] == CS_KERNEL_STOCKHAM);
     auto row2Plan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM, this);
     row2Plan->length.push_back(lenFactor0);
     row2Plan->length.push_back(lenFactor1);
@@ -83,6 +96,8 @@
     }
 
     // third transpose
+    if(!noSolution)
+        assert(child_schemes[4] == CS_KERNEL_TRANSPOSE);
     auto trans3Plan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE, this);
     trans3Plan->length.push_back(lenFactor0);
     trans3Plan->length.push_back(lenFactor1);
@@ -296,8 +311,10 @@
 /*****************************************************
  * L1D_CC  *
  *****************************************************/
-void CC1DNode::BuildTree_internal()
+void CC1DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     size_t lenFactor1 = length.back();
     size_t lenFactor0 = length[0] / lenFactor1;
     if(lenFactor0 * lenFactor1 != length[0])
@@ -305,18 +322,22 @@
     length.pop_back();
 
     // first plan, column-to-column
+    if(!noSolution)
+        assert(child_schemes[0] == CS_KERNEL_STOCKHAM_BLOCK_CC);
     auto col2colPlan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_CC, this);
+
+    col2colPlan->typeBlue = typeBlue;
+    col2colPlan->fuseBlue = fuseBlue;
+    if(fuseBlue != BFT_NONE)
+    {
+        col2colPlan->lengthBlue  = lengthBlue;
+        col2colPlan->lengthBlueN = lengthBlueN;
+        if(fuseBlue == BFT_FWD_CHIRP || fuseBlue == BFT_FWD_CHIRP_MUL)
+            col2colPlan->need_chirp = true;
+    }
+
     // large1D flag to confirm we need multiply twiddle factor
     col2colPlan->large1D = length[0];
-    if(function_pool::has_SBCC_kernel(lenFactor1, precision))
-    {
-        // decompose the large twd table for L1D_CC
-        // exclude some exceptions that don't get benefit from 3-step LargeTwd (set in FFTKernel)
-        auto kernel
-            = function_pool::get_kernel(fpkey(lenFactor1, precision, CS_KERNEL_STOCKHAM_BLOCK_CC));
-        col2colPlan->largeTwd3Steps = kernel.use_3steps_large_twd;
-        col2colPlan->set_large_twd_base_steps(length[0]);
-    }
     col2colPlan->length.push_back(lenFactor1);
     col2colPlan->length.push_back(lenFactor0);
     col2colPlan->dimension = 1;
@@ -328,7 +349,20 @@
     std::swap(col2colPlan->outputLength[0], col2colPlan->outputLength[1]);
 
     // second plan, row-to-column
+    if(!noSolution)
+        assert(child_schemes[1] == CS_KERNEL_STOCKHAM_BLOCK_RC);
     auto row2colPlan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_RC, this);
+
+    row2colPlan->typeBlue = typeBlue;
+    row2colPlan->fuseBlue = fuseBlue;
+    if(fuseBlue != BFT_NONE)
+    {
+        row2colPlan->lengthBlue  = lengthBlue;
+        row2colPlan->lengthBlueN = lengthBlueN;
+        if(fuseBlue == BFT_INV_CHIRP_MUL)
+            row2colPlan->need_chirp = true;
+    }
+
     row2colPlan->length.push_back(lenFactor0);
     row2colPlan->length.push_back(lenFactor1);
     row2colPlan->dimension = 1;
@@ -349,7 +383,10 @@
     auto& col2colPlan = childNodes[0];
     auto& row2colPlan = childNodes[1];
 
-    if((obOut == OB_USER_OUT) || (obOut == OB_TEMP_CMPLX_FOR_REAL))
+    assert(inStrideBlue.size() == outStrideBlue.size());
+    bool setBlueData = inStrideBlue.size();
+
+    if((obOut == OB_USER_OUT) || (obOut == OB_TEMP_CMPLX_FOR_REAL) || (obOut == OB_TEMP_BLUESTEIN))
     {
         // B -> T
         col2colPlan->inStride.push_back(inStride[0] * col2colPlan->length[1]);
@@ -360,11 +397,29 @@
         col2colPlan->outStride.push_back(1);
         col2colPlan->oDist = length[0];
 
+        if(setBlueData)
+        {
+            col2colPlan->outStrideBlue.push_back(col2colPlan->length[1]);
+            col2colPlan->outStrideBlue.push_back(1);
+            col2colPlan->oDistBlue = lengthBlue;
+
+            col2colPlan->inStrideBlue.push_back(inStrideBlue[0] * col2colPlan->length[1]);
+            col2colPlan->inStrideBlue.push_back(inStrideBlue[0]);
+            col2colPlan->iDistBlue = iDistBlue;
+        }
+
         for(size_t index = 1; index < length.size(); index++)
         {
             col2colPlan->inStride.push_back(inStride[index]);
             col2colPlan->outStride.push_back(col2colPlan->oDist);
             col2colPlan->oDist *= length[index];
+
+            if(setBlueData)
+            {
+                col2colPlan->inStrideBlue.push_back(inStrideBlue[index]);
+                col2colPlan->outStrideBlue.push_back(col2colPlan->oDistBlue);
+                col2colPlan->oDistBlue *= length[index];
+            }
         }
 
         // T -> B
@@ -376,11 +431,29 @@
         row2colPlan->outStride.push_back(outStride[0] * row2colPlan->length[1]);
         row2colPlan->oDist = oDist;
 
+        if(setBlueData)
+        {
+            row2colPlan->inStrideBlue.push_back(1);
+            row2colPlan->inStrideBlue.push_back(row2colPlan->length[0]);
+            row2colPlan->iDistBlue = lengthBlue;
+
+            row2colPlan->outStrideBlue.push_back(outStrideBlue[0]);
+            row2colPlan->outStrideBlue.push_back(outStrideBlue[0] * row2colPlan->length[1]);
+            row2colPlan->oDistBlue = oDistBlue;
+        }
+
         for(size_t index = 1; index < length.size(); index++)
         {
             row2colPlan->inStride.push_back(row2colPlan->iDist);
             row2colPlan->iDist *= length[index];
             row2colPlan->outStride.push_back(outStride[index]);
+
+            if(setBlueData)
+            {
+                row2colPlan->inStrideBlue.push_back(row2colPlan->iDistBlue);
+                row2colPlan->iDistBlue *= length[index];
+                row2colPlan->outStrideBlue.push_back(outStrideBlue[index]);
+            }
         }
     }
     else
@@ -396,9 +469,21 @@
         col2colPlan->inStride.push_back(inStride[0]);
         col2colPlan->iDist = iDist;
 
+        if(setBlueData)
+        {
+            col2colPlan->inStrideBlue.push_back(inStrideBlue[0] * col2colPlan->length[1]);
+            col2colPlan->inStrideBlue.push_back(inStrideBlue[0]);
+            col2colPlan->iDistBlue = iDistBlue;
+        }
+
         for(size_t index = 1; index < length.size(); index++)
+        {
             col2colPlan->inStride.push_back(inStride[index]);
 
+            if(setBlueData)
+                col2colPlan->inStrideBlue.push_back(inStrideBlue[index]);
+        }
+
         if(parent->scheme == CS_L1D_TRTRT)
         {
             col2colPlan->outStride.push_back(parent->outStride[0] * col2colPlan->length[1]);
@@ -407,23 +492,58 @@
                                              * col2colPlan->length[0]);
             col2colPlan->oDist = parent->oDist;
 
+            if(setBlueData)
+            {
+                col2colPlan->outStrideBlue.push_back(parent->outStrideBlue[0]
+                                                     * col2colPlan->length[1]);
+                col2colPlan->outStrideBlue.push_back(parent->outStrideBlue[0]);
+                col2colPlan->outStrideBlue.push_back(
+                    parent->outStrideBlue[0] * col2colPlan->length[1] * col2colPlan->length[0]);
+                col2colPlan->oDistBlue = parent->oDistBlue;
+            }
+
             for(size_t index = 1; index < parent->length.size(); index++)
+            {
                 col2colPlan->outStride.push_back(parent->outStride[index]);
+
+                if(setBlueData)
+                    col2colPlan->outStrideBlue.push_back(parent->outStrideBlue[index]);
+            }
         }
         else
         {
             // we dont have B info here, need to assume packed data and descended
             // from 2D/3D
-            assert(parent->outStride[0] == 1);
+            //assert(parent->outStride[0] == 1);
+            //assert(parent->outStrideBlue[0] == 1);
 
             col2colPlan->outStride.push_back(col2colPlan->length[1]);
             col2colPlan->outStride.push_back(1);
-            col2colPlan->oDist = col2colPlan->length[1] * col2colPlan->length[0];
+
+            if(setBlueData)
+            {
+                col2colPlan->outStrideBlue.push_back(col2colPlan->length[1]);
+                col2colPlan->outStrideBlue.push_back(1);
+            }
+
+            if(fuseBlue != BFT_NONE)
+            {
+                col2colPlan->oDist     = lengthBlueN;
+                col2colPlan->oDistBlue = lengthBlue;
+            }
+            else
+                col2colPlan->oDist = col2colPlan->length[1] * col2colPlan->length[0];
 
             for(size_t index = 1; index < length.size(); index++)
             {
                 col2colPlan->outStride.push_back(col2colPlan->oDist);
                 col2colPlan->oDist *= length[index];
+
+                if(setBlueData)
+                {
+                    col2colPlan->outStrideBlue.push_back(col2colPlan->oDistBlue);
+                    col2colPlan->oDistBlue *= length[index];
+                }
             }
         }
 
@@ -436,8 +556,23 @@
                                             * row2colPlan->length[1]);
             row2colPlan->iDist = parent->oDist;
 
+            if(setBlueData)
+            {
+                row2colPlan->inStrideBlue.push_back(parent->outStrideBlue[0]);
+                row2colPlan->inStrideBlue.push_back(parent->outStrideBlue[0]
+                                                    * row2colPlan->length[0]);
+                row2colPlan->inStrideBlue.push_back(
+                    parent->outStrideBlue[0] * row2colPlan->length[0] * row2colPlan->length[1]);
+                row2colPlan->iDistBlue = parent->oDistBlue;
+            }
+
             for(size_t index = 1; index < parent->length.size(); index++)
+            {
                 row2colPlan->inStride.push_back(parent->outStride[index]);
+
+                if(setBlueData)
+                    row2colPlan->inStrideBlue.push_back(parent->outStrideBlue[index]);
+            }
         }
         else
         {
@@ -445,12 +580,31 @@
             // from 2D/3D
             row2colPlan->inStride.push_back(1);
             row2colPlan->inStride.push_back(row2colPlan->length[0]);
-            row2colPlan->iDist = row2colPlan->length[0] * row2colPlan->length[1];
+
+            if(setBlueData)
+            {
+                row2colPlan->inStrideBlue.push_back(1);
+                row2colPlan->inStrideBlue.push_back(row2colPlan->length[0]);
+            }
+
+            if(fuseBlue != BFT_NONE)
+            {
+                row2colPlan->iDist     = lengthBlueN;
+                row2colPlan->iDistBlue = lengthBlue;
+            }
+            else
+                row2colPlan->iDist = row2colPlan->length[0] * row2colPlan->length[1];
 
             for(size_t index = 1; index < length.size(); index++)
             {
                 row2colPlan->inStride.push_back(row2colPlan->iDist);
                 row2colPlan->iDist *= length[index];
+
+                if(setBlueData)
+                {
+                    row2colPlan->inStrideBlue.push_back(row2colPlan->iDistBlue);
+                    row2colPlan->iDistBlue *= length[index];
+                }
             }
         }
 
@@ -458,8 +612,20 @@
         row2colPlan->outStride.push_back(outStride[0] * row2colPlan->length[1]);
         row2colPlan->oDist = oDist;
 
+        if(setBlueData)
+        {
+            row2colPlan->outStrideBlue.push_back(outStrideBlue[0]);
+            row2colPlan->outStrideBlue.push_back(outStrideBlue[0] * row2colPlan->length[1]);
+            row2colPlan->oDistBlue = oDistBlue;
+        }
+
         for(size_t index = 1; index < length.size(); index++)
+        {
             row2colPlan->outStride.push_back(outStride[index]);
+
+            if(setBlueData)
+                row2colPlan->outStrideBlue.push_back(outStrideBlue[index]);
+        }
     }
 
     // special case for strided large 1D FFT with dist 1
@@ -497,8 +663,10 @@
 /*****************************************************
  * L1D_CRT  *
  *****************************************************/
-void CRT1DNode::BuildTree_internal()
+void CRT1DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     size_t lenFactor1 = length.back();
     size_t lenFactor0 = length[0] / lenFactor1;
     if(lenFactor0 * lenFactor1 != length[0])
@@ -506,18 +674,11 @@
     length.pop_back();
 
     // first plan, column-to-column
+    if(!noSolution)
+        assert(child_schemes[0] == CS_KERNEL_STOCKHAM_BLOCK_CC);
     auto col2colPlan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_CC, this);
     // large1D flag to confirm we need multiply twiddle factor
     col2colPlan->large1D = length[0];
-    if(function_pool::has_SBCC_kernel(lenFactor1, precision))
-    {
-        // decompose the large twd table for L1D_CRT
-        // exclude some exceptions that don't get benefit from 3-step LargeTwd (set in FFTKernel)
-        auto kernel
-            = function_pool::get_kernel(fpkey(lenFactor1, precision, CS_KERNEL_STOCKHAM_BLOCK_CC));
-        col2colPlan->largeTwd3Steps = kernel.use_3steps_large_twd;
-        col2colPlan->set_large_twd_base_steps(length[0]);
-    }
     col2colPlan->length.push_back(lenFactor1);
     col2colPlan->length.push_back(lenFactor0);
     col2colPlan->dimension = 1;
@@ -529,6 +690,8 @@
     std::swap(col2colPlan->outputLength[0], col2colPlan->outputLength[1]);
 
     // second plan, row-to-row
+    if(!noSolution)
+        assert(child_schemes[1] == CS_KERNEL_STOCKHAM);
     auto row2rowPlan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM, this);
     row2rowPlan->length.push_back(lenFactor0);
     row2rowPlan->length.push_back(lenFactor1);
@@ -541,6 +704,8 @@
     // row2rowPlan->allowOutofplace = false;
 
     // third plan, transpose
+    if(!noSolution)
+        assert(child_schemes[2] == CS_KERNEL_TRANSPOSE);
     auto transPlan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE, this);
     transPlan->length.push_back(lenFactor0);
     transPlan->length.push_back(lenFactor1);
@@ -716,15 +881,17 @@
     for(size_t j = 1; j < length.size(); j++)
         batch_accum *= length[j];
 
-    auto kernel = function_pool::get_kernel(fpkey(length[0], precision));
+    auto key    = GetKernelKey();
+    auto kernel = function_pool::get_kernel(key);
     fnPtr       = kernel.device_function;
 
     if(ebtype != EmbeddedType::NONE)
         lds_padding = 1;
 
     bwd      = kernel.transforms_per_block;
+    wgs      = kernel.workgroup_size;
     gp.b_x   = (batch_accum + bwd - 1) / bwd;
-    gp.wgs_x = kernel.workgroup_size;
+    gp.wgs_x = wgs;
 
     // we don't even need lds (kernel_1,2,3,4,5,6,7,10,11,13,17) since we don't use them at all
     // TODO: we can even use swizzle to do the butterfly shuffle if threads_per_transform[0] <= warpSize
@@ -749,14 +916,18 @@
     }
 }
 
-bool Stockham1DNode::CreateTwiddleTableResource()
+bool Stockham1DNode::CreateDeviceResources()
 {
     twd_attach_halfN = (ebtype != EmbeddedType::NONE);
-    return LeafNode::CreateTwiddleTableResource();
+    return LeafNode::CreateDeviceResources();
 }
 
 std::vector<size_t> Stockham1DNode::CollapsibleDims()
 {
+    // do not collapse on multi-kernel fused Bluestein nodes
+    if(typeBlue == BT_MULTI_KERNEL_FUSED)
+        return {};
+
     // fastest dim is FFT, the rest is collapsible
     std::vector<size_t> ret(length.size() - 1);
     std::iota(ret.begin(), ret.end(), 1);
@@ -766,21 +937,49 @@
 /*****************************************************
  * SBCC  *
  *****************************************************/
-bool SBCCNode::KernelCheck()
+bool SBCCNode::KernelCheck(std::vector<FMKey>& kernel_keys)
 {
-    bool res = LeafNode::KernelCheck();
+    bool res = LeafNode::KernelCheck(kernel_keys);
+    if(!res)
+        return false;
 
-    // set according to benchmark
-    SetDirectRegType();
+    if(large1D > 0)
+    {
+        FMKey key      = GetKernelKey();
+        auto  kernel   = function_pool::get_kernel(key);
+        largeTwd3Steps = kernel.use_3steps_large_twd;
+        get_large_twd_base_steps(large1D, largeTwd3Steps, largeTwdBase, ltwdSteps);
+    }
 
-    // set according to benchmark
-    SetIntrinsicMode();
+    // if we are doing tuning or running with the tuned solution, we have the specified_key.
+    // we must directly run the kernel with the exact setting as the config
+    // without the hardcoded tuning
+    if(specified_key != nullptr)
+    {
+        InitIntrinsicMode();
+        return true;
+    }
+
+    // hardcoded tuning according to benchmark
+    TuneDirectRegType();
+
+    // check if we can use buffer instr
+    InitIntrinsicMode();
+    // hardcoded tuning according to benchmark
+    TuneIntrinsicMode();
 
-    return res;
+    return true;
 }
 
-void SBCCNode::SetDirectRegType()
+void SBCCNode::TuneDirectRegType()
 {
+    // half precision has not been tested yet, disable it for now.
+    if(precision == rocfft_precision_half)
+    {
+        dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
+        return;
+    }
+
     // for Navi, Haven't tested all.
     if(is_device_gcn_arch(deviceProp, "gfx1030"))
     {
@@ -796,7 +995,7 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {125, 192, 216, 224, 240, 243}},
                {rocfft_precision_double, {224, 343}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     else if(is_device_gcn_arch(deviceProp, "gfx90a"))
@@ -807,69 +1006,99 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {125, 192, 216, 200, 224, 240}},
                {rocfft_precision_double, {125, 224, 243}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
 }
 
-void SBCCNode::SetIntrinsicMode()
+void SBCCNode::InitIntrinsicMode()
 {
-    // NB: remember set this value at this point instead of SetupGPAndFnPtr_internal()
-    //     since we might need to pass this value to RTC generator
-    intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
-
-    // TODO- To test on gfx90a
-    if((is_device_gcn_arch(deviceProp, "gfx906") || is_device_gcn_arch(deviceProp, "gfx908")
-        || is_device_gcn_arch(deviceProp, "gfx1030"))
-       && (dir2regMode == TRY_ENABLE_IF_SUPPORT))
-    {
-        // General rejections: cases we can't use buffer load
-        if(((uint64_t)iDist * batch * sizeof_precision(precision) < 0xFFFFFFFF)
-           && ((uint64_t)oDist * batch * sizeof_precision(precision) < 0xFFFFFFFF))
-        {
-            if(placement == rocfft_placement_inplace)
-                intrinsicMode = IntrinsicAccessType::ENABLE_LOAD_ONLY;
-            else
-                intrinsicMode = IntrinsicAccessType::ENABLE_BOTH;
-        }
+    // 1. General rejections: (Guard) cases we definitely can't use buffer instruction
+    // 2. half precision has not been tested yet, disable it for now.
+    if(((uint64_t)iDist * batch * complex_type_size(precision) >= 0xFFFFFFFF)
+       || ((uint64_t)oDist * batch * complex_type_size(precision) >= 0xFFFFFFFF)
+       || (precision == rocfft_precision_half))
+    {
+        intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
+        return;
+    }
 
-        // Based on benchmark results
-        if(is_device_gcn_arch(deviceProp, "gfx906"))
-        {
-            // bad results from benchmark:
-            // {96,sp}, {125,sp}, {192,sp/dp}, {240,dp}, {256,sp/dp}, {343,sp/dp}
-            std::map<rocfft_precision, std::set<size_t>> exceptions
-                = {{rocfft_precision_single, {96, 125, 192, 256, 343}},
-                   {rocfft_precision_double, {192, 240, 256, 343}}};
-            if(exceptions.at(precision).count(length[0]))
-                intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
-        }
-        else if(is_device_gcn_arch(deviceProp, "gfx908"))
-        {
-            // bad results from benchmark:
-            // {104,sp/dp}, {192,dp}, {240,dp}, {289,sp}
-            std::map<rocfft_precision, std::set<size_t>> exceptions = {
-                {rocfft_precision_single, {104, 289}}, {rocfft_precision_double, {104, 192, 240}}};
-            if(exceptions.at(precision).count(length[0]))
-                intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
-        }
+    // case 1: running tuning or a tuned solution: use the setting in the config
+    if(specified_key != nullptr)
+    {
+        auto& config  = std::get<4>(*specified_key.get());
+        intrinsicMode = (config.intrinsic_buffer_inst) ? IntrinsicAccessType::ENABLE_BOTH
+                                                       : IntrinsicAccessType::DISABLE_BOTH;
+        return;
+    }
+
+    // case 2: un-tuned: auto decision: use buffer instructions whenever possible
+    if(dir2regMode == TRY_ENABLE_IF_SUPPORT)
+    {
+        if(placement == rocfft_placement_inplace)
+            intrinsicMode = IntrinsicAccessType::ENABLE_LOAD_ONLY;
+        else
+            intrinsicMode = IntrinsicAccessType::ENABLE_BOTH;
+    }
+}
+
+// NB: remember to set this value at this point instead of in SetupGPAndFnPtr_internal(),
+//     since we might need to pass this value to the RTC generator
+void SBCCNode::TuneIntrinsicMode()
+{
+    // already disabled
+    if(intrinsicMode == IntrinsicAccessType::DISABLE_BOTH)
+        return;
+
+    // hardcoded turn-off in some exception cases
+    // 1. currently we only enable this on 906, 908, 1030. TODO- test on 90a
+    if((is_device_gcn_arch(deviceProp, "gfx906") == false)
+       && (is_device_gcn_arch(deviceProp, "gfx908") == false)
+       && (is_device_gcn_arch(deviceProp, "gfx1030") == false))
+    {
+        intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
+    }
+    // 2. exception cases on 906. Based on benchmark results
+    else if(is_device_gcn_arch(deviceProp, "gfx906"))
+    {
+        // bad results from benchmark:
+        // {96,sp}, {125,sp}, {192,sp/dp}, {240,dp}, {256,sp/dp}, {343,sp/dp}
+        std::map<rocfft_precision, std::set<size_t>> exceptions
+            = {{rocfft_precision_single, {96, 125, 192, 256, 343}},
+               {rocfft_precision_double, {192, 240, 256, 343}}};
+        if(length_excepted(exceptions, precision, length[0]))
+            intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
+    }
+    // 3. exception cases on 908. Based on benchmark results
+    else if(is_device_gcn_arch(deviceProp, "gfx908"))
+    {
+        // bad results from benchmark:
+        // {104,sp/dp}, {192,dp}, {240,dp}, {289,sp}
+        std::map<rocfft_precision, std::set<size_t>> exceptions
+            = {{rocfft_precision_single, {104, 289}}, {rocfft_precision_double, {104, 192, 240}}};
+        if(length_excepted(exceptions, precision, length[0]))
+            intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
     }
 }
 
 void SBCCNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel = function_pool::get_kernel(fpkey(length[0], precision, scheme));
+    auto kernel = function_pool::get_kernel(GetKernelKey());
     fnPtr       = kernel.device_function;
     bwd         = kernel.transforms_per_block;
     wgs         = kernel.workgroup_size;
     lds         = length[0] * bwd;
     gp.b_x      = ((length[1]) - 1) / bwd + 1;
     gp.b_x *= std::accumulate(length.begin() + 2, length.end(), batch, std::multiplies<size_t>());
-    gp.wgs_x = kernel.workgroup_size;
+    gp.wgs_x = wgs;
 }
 
 std::vector<size_t> SBCCNode::CollapsibleDims()
 {
+    // do not collapse on multi-kernel fused Bluestein nodes
+    if(typeBlue == BT_MULTI_KERNEL_FUSED)
+        return {};
+
     // second-fastest dim is FFT, higher dims are collapsible
     std::vector<size_t> ret(length.size() - 2);
     std::iota(ret.begin(), ret.end(), 2);
@@ -879,25 +1108,63 @@
 /*****************************************************
  * SBRC  *
  *****************************************************/
-bool SBRCNode::KernelCheck()
+FMKey SBRCNode::GetKernelKey() const
 {
-    bool res = LeafNode::KernelCheck();
+    if(specified_key)
+        return *specified_key.get();
 
-    // set according to benchmark
-    SetDirectRegType();
+    // NB: Need to make sure that sbrcTranstype has the correct value
+    if(sbrcTranstype == SBRC_TRANSPOSE_TYPE::NONE)
+    {
+        // find the base kernel at first
+        FMKey baseKey = fpkey(length[0], precision, scheme, TILE_ALIGNED);
+        // if we have the base kernel, then we set the exact sbrc_trans_type and return the real key
+        // if we don't, then we simply return a key with NONE sbrc_trans_type
+        // which will make KernelCheck() trigger an exception
+        if(function_pool::has_function(baseKey))
+        {
+            auto bwd      = function_pool::get_kernel(baseKey).transforms_per_block;
+            sbrcTranstype = sbrc_transpose_type(bwd);
+        }
+    }
 
-    return res;
+    return fpkey(length[0], precision, scheme, sbrcTranstype);
 }
 
-void SBRCNode::SetDirectRegType()
+bool SBRCNode::KernelCheck(std::vector<FMKey>& kernel_keys)
 {
+    bool res = LeafNode::KernelCheck(kernel_keys);
+    if(!res)
+        return false;
+
+    // if we are doing tuning or running with the tuned solution, we have the specified_key.
+    // we must directly run the kernel with the exact setting as the config
+    // without the hardcoded tuning
+    if(specified_key != nullptr)
+        return true;
+
+    // hardcoded tuning according to benchmark
+    TuneDirectRegType();
+
+    return true;
+}
+
+void SBRCNode::TuneDirectRegType()
+{
+    // half precision has not been tested yet, disable it for now.
+    if(precision == rocfft_precision_half)
+    {
+        dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
+        return;
+    }
+
     if(is_device_gcn_arch(deviceProp, "gfx906"))
     {
         // bad results from benchmark:
         // {49,sp}, {128,sp}, {64,dp}, {81,dp}, {100,dp}
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {49, 128}}, {rocfft_precision_double, {64, 81, 100}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     else if(is_device_gcn_arch(deviceProp, "gfx908"))
@@ -907,7 +1174,7 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {81, 100, 128, 192, 200, 512}},
                {rocfft_precision_double, {125, 128}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     else if(is_device_gcn_arch(deviceProp, "gfx90a"))
@@ -917,7 +1184,7 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {49, 81, 100, 125, 200, 512}},
                {rocfft_precision_double, {64, 81, 100, 125}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     // we don't enable the features for others
@@ -929,42 +1196,61 @@
 
 void SBRCNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel   = function_pool::get_kernel(fpkey(length[0], precision, scheme));
-    bwd           = kernel.transforms_per_block;
-    wgs           = kernel.workgroup_size;
-    lds           = length[0] * bwd;
-    sbrcTranstype = sbrc_transpose_type(bwd);
-    fnPtr         = function_pool::get_function(fpkey(length[0], precision, scheme, sbrcTranstype));
-    gp.b_x        = (length[1] - 1) / bwd + 1;
+    // sbrcTranstype has already been assigned in KernelCheck();
+    auto kernel = function_pool::get_kernel(GetKernelKey());
+    fnPtr       = kernel.device_function;
+    bwd         = kernel.transforms_per_block;
+    wgs         = kernel.workgroup_size;
+    lds         = length[0] * bwd;
+    gp.b_x      = (length[1] - 1) / bwd + 1;
     gp.b_x *= std::accumulate(length.begin() + 2, length.end(), batch, std::multiplies<size_t>());
-    gp.wgs_x = kernel.workgroup_size;
+    gp.wgs_x = wgs;
 }
 
 SBRC_TRANSPOSE_TYPE SBRCNode::sbrc_transpose_type(unsigned int blockWidth) const
 {
-    // NB: Since we need a NONE as default, so this NONE is actually TILE_ALIGNED
     auto alignment_dimension = length[1];
-    return (alignment_dimension % blockWidth == 0) ? NONE : TILE_UNALIGNED;
+    return (alignment_dimension % blockWidth == 0) ? TILE_ALIGNED : TILE_UNALIGNED;
 }
 
 /*****************************************************
  * SBCR  *
  *****************************************************/
-bool SBCRNode::KernelCheck()
+bool SBCRNode::KernelCheck(std::vector<FMKey>& kernel_keys)
 {
-    bool res = LeafNode::KernelCheck();
+    bool res = LeafNode::KernelCheck(kernel_keys);
+    if(!res)
+        return false;
+
+    // if we are doing tuning or running with the tuned solution, we have the specified_key.
+    // we must directly run the kernel with the exact setting as the config
+    // without the hardcoded tuning
+    if(specified_key != nullptr)
+    {
+        InitIntrinsicMode();
+        return true;
+    }
 
-    // set according to benchmark
-    SetDirectRegType();
+    // hardcoded tuning according to benchmark
+    TuneDirectRegType();
 
-    // set according to benchmark
-    SetIntrinsicMode();
+    // check if we can use buffer instr
+    InitIntrinsicMode();
+    // hardcoded tuning according to benchmark
+    TuneIntrinsicMode();
 
-    return res;
+    return true;
 }
 
-void SBCRNode::SetDirectRegType()
+void SBCRNode::TuneDirectRegType()
 {
+    // half precision has not been tested yet, disable it for now.
+    if(precision == rocfft_precision_half)
+    {
+        dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
+        return;
+    }
+
     // switch on/off according to the arch
     // tweaking the setting based on the benchmark results.
 
@@ -982,33 +1268,60 @@
     }
 }
 
-void SBCRNode::SetIntrinsicMode()
+void SBCRNode::InitIntrinsicMode()
 {
-    // NB: remember set this value at this point instead of SetupGPAndFnPtr_internal()
-    //     since we might need to pass this value to RTC generator
-    intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
+    // 1. General rejections: (Guard) cases we definitely can't use buffer instruction
+    // 2. half precision has not been tested yet, disable it for now.
+    if(((uint64_t)iDist * batch * complex_type_size(precision) >= 0xFFFFFFFF)
+       || ((uint64_t)oDist * batch * complex_type_size(precision) >= 0xFFFFFFFF)
+       || (precision == rocfft_precision_half))
+    {
+        intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
+        return;
+    }
 
-    // TODO- To test on gfx90a
-    if(is_device_gcn_arch(deviceProp, "gfx908") && (dir2regMode == TRY_ENABLE_IF_SUPPORT))
+    // case 1: running tuning or a tuned solution: use the setting in the config
+    if(specified_key != nullptr)
     {
-        if(((uint64_t)iDist * batch * sizeof_precision(precision) < 0xFFFFFFFF)
-           && ((uint64_t)oDist * batch * sizeof_precision(precision) < 0xFFFFFFFF))
-        {
-            intrinsicMode = IntrinsicAccessType::ENABLE_BOTH;
-        }
+        auto& config  = std::get<4>(*specified_key.get());
+        intrinsicMode = (config.intrinsic_buffer_inst) ? IntrinsicAccessType::ENABLE_BOTH
+                                                       : IntrinsicAccessType::DISABLE_BOTH;
+        return;
+    }
+
+    // case 2: un-tuned: auto decision: use buffer instructions whenever possible
+    if(dir2regMode == TRY_ENABLE_IF_SUPPORT)
+    {
+        intrinsicMode = IntrinsicAccessType::ENABLE_BOTH;
+    }
+}
+
+// NB: remember to set this value at this point instead of in SetupGPAndFnPtr_internal(),
+//     since we might need to pass this value to the RTC generator
+void SBCRNode::TuneIntrinsicMode()
+{
+    // already disabled
+    if(intrinsicMode == IntrinsicAccessType::DISABLE_BOTH)
+        return;
+
+    // hardcoded turn-off in some exception cases
+    // 1. currently we only enable this on 908. TODO- test on 90a
+    if(is_device_gcn_arch(deviceProp, "gfx908") == false)
+    {
+        intrinsicMode = IntrinsicAccessType::DISABLE_BOTH;
     }
 }
 
 void SBCRNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel = function_pool::get_kernel(fpkey(length[0], precision, scheme));
+    auto kernel = function_pool::get_kernel(GetKernelKey());
     fnPtr       = kernel.device_function;
     wgs         = kernel.workgroup_size;
     bwd         = kernel.transforms_per_block;
     lds         = length[0] * bwd;
     gp.b_x      = ((length[1]) - 1) / bwd + 1;
     gp.b_x *= std::accumulate(length.begin() + 2, length.end(), batch, std::multiplies<size_t>());
-    gp.wgs_x = kernel.workgroup_size;
+    gp.wgs_x = wgs;
 
     if(ebtype != EmbeddedType::NONE)
         lds_padding = 1;
@@ -1016,8 +1329,8 @@
     return;
 }
 
-bool SBCRNode::CreateTwiddleTableResource()
+bool SBCRNode::CreateDeviceResources()
 {
     twd_attach_halfN = (ebtype != EmbeddedType::NONE);
-    return LeafNode::CreateTwiddleTableResource();
+    return LeafNode::CreateDeviceResources();
 }
diff -Nru rocfft-5.5.0/library/src/tree_node_2D.cpp rocfft-5.7.1/library/src/tree_node_2D.cpp
--- rocfft-5.5.0/library/src/tree_node_2D.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/tree_node_2D.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,10 @@
 /*****************************************************
  * 2D_RTRT  *
  *****************************************************/
-void RTRT2DNode::BuildTree_internal()
+void RTRT2DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     // first row fft
     NodeMetaData row1PlanData(this);
     row1PlanData.length.push_back(length[0]);
@@ -39,10 +41,14 @@
     {
         row1PlanData.length.push_back(length[index]);
     }
-    auto row1Plan = NodeFactory::CreateExplicitNode(row1PlanData, this);
+    // skip the decide scheme part in node factory
+    ComputeScheme determined_scheme = (noSolution) ? CS_NONE : child_schemes[0];
+    auto          row1Plan = NodeFactory::CreateExplicitNode(row1PlanData, this, determined_scheme);
     row1Plan->RecursiveBuildTree();
 
     // first transpose
+    if(!noSolution)
+        assert(child_schemes[1] == CS_KERNEL_TRANSPOSE);
     auto trans1Plan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE, this);
     trans1Plan->length.push_back(length[0]);
     trans1Plan->length.push_back(length[1]);
@@ -62,10 +68,14 @@
     {
         row2PlanData.length.push_back(length[index]);
     }
-    auto row2Plan = NodeFactory::CreateExplicitNode(row2PlanData, this);
+    // skip the decide scheme part in node factory
+    determined_scheme = (noSolution) ? CS_NONE : child_schemes[2];
+    auto row2Plan     = NodeFactory::CreateExplicitNode(row2PlanData, this, determined_scheme);
     row2Plan->RecursiveBuildTree();
 
     // second transpose
+    if(!noSolution)
+        assert(child_schemes[3] == CS_KERNEL_TRANSPOSE);
     auto trans2Plan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE, this);
     trans2Plan->length.push_back(length[1]);
     trans2Plan->length.push_back(length[0]);
@@ -100,11 +110,23 @@
 
 void RTRT2DNode::AssignParams_internal()
 {
+    assert(inStrideBlue.size() == outStrideBlue.size());
+    bool setBlueData = inStrideBlue.size();
+
     auto& row1Plan      = childNodes[0];
     row1Plan->inStride  = inStride;
     row1Plan->iDist     = iDist;
     row1Plan->outStride = outStride;
     row1Plan->oDist     = oDist;
+
+    if(setBlueData)
+    {
+        row1Plan->inStrideBlue  = inStrideBlue;
+        row1Plan->iDistBlue     = iDistBlue;
+        row1Plan->outStrideBlue = outStrideBlue;
+        row1Plan->oDistBlue     = oDistBlue;
+    }
+
     row1Plan->AssignParams();
 
     auto& trans1Plan     = childNodes[1];
@@ -113,10 +135,26 @@
     trans1Plan->outStride.push_back(trans1Plan->length[1]);
     trans1Plan->outStride.push_back(1);
     trans1Plan->oDist = trans1Plan->length[0] * trans1Plan->outStride[0];
+
+    if(setBlueData)
+    {
+        trans1Plan->inStrideBlue = row1Plan->outStrideBlue;
+        trans1Plan->iDistBlue    = row1Plan->oDistBlue;
+        trans1Plan->outStrideBlue.push_back(trans1Plan->length[1]);
+        trans1Plan->outStrideBlue.push_back(1);
+        trans1Plan->oDistBlue = trans1Plan->length[0] * trans1Plan->outStrideBlue[0];
+    }
+
     for(size_t index = 2; index < length.size(); index++)
     {
         trans1Plan->outStride.push_back(trans1Plan->oDist);
         trans1Plan->oDist *= length[index];
+
+        if(setBlueData)
+        {
+            trans1Plan->outStrideBlue.push_back(trans1Plan->oDistBlue);
+            trans1Plan->oDistBlue *= length[index];
+        }
     }
 
     auto& row2Plan     = childNodes[2];
@@ -125,6 +163,15 @@
     row2Plan->iDist     = trans1Plan->oDist;
     row2Plan->outStride = row2Plan->inStride;
     row2Plan->oDist     = row2Plan->iDist;
+    if(setBlueData)
+    {
+        row2Plan->inStrideBlue = trans1Plan->outStrideBlue;
+        std::swap(row2Plan->inStrideBlue[0], row2Plan->inStrideBlue[1]);
+        row2Plan->iDistBlue     = trans1Plan->oDistBlue;
+        row2Plan->outStrideBlue = row2Plan->inStrideBlue;
+        row2Plan->oDistBlue     = row2Plan->iDistBlue;
+    }
+
     row2Plan->AssignParams();
 
     auto& trans2Plan      = childNodes[3];
@@ -133,13 +180,24 @@
     trans2Plan->outStride = outStride;
     std::swap(trans2Plan->outStride[0], trans2Plan->outStride[1]);
     trans2Plan->oDist = oDist;
+
+    if(setBlueData)
+    {
+        trans2Plan->inStrideBlue  = row2Plan->outStrideBlue;
+        trans2Plan->iDistBlue     = row2Plan->oDistBlue;
+        trans2Plan->outStrideBlue = outStrideBlue;
+        std::swap(trans2Plan->outStrideBlue[0], trans2Plan->outStrideBlue[1]);
+        trans2Plan->oDistBlue = oDistBlue;
+    }
 }
 
 /*****************************************************
  * 2D_RC  *
  *****************************************************/
-void RC2DNode::BuildTree_internal()
+void RC2DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     // row fft
     NodeMetaData rowPlanData(this);
     rowPlanData.length.push_back(length[0]);
@@ -149,10 +207,14 @@
     {
         rowPlanData.length.push_back(length[index]);
     }
-    auto rowPlan = NodeFactory::CreateExplicitNode(rowPlanData, this);
+    // skip the decide scheme part in node factory
+    ComputeScheme determined_scheme = (noSolution) ? CS_NONE : child_schemes[0];
+    auto          rowPlan = NodeFactory::CreateExplicitNode(rowPlanData, this, determined_scheme);
     rowPlan->RecursiveBuildTree();
 
     // column fft
+    if(!noSolution)
+        assert(child_schemes[1] == CS_KERNEL_STOCKHAM_BLOCK_CC);
     auto colPlan = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_CC, this);
     colPlan->length.push_back(length[1]);
     colPlan->dimension = 1;
@@ -198,22 +260,24 @@
 /*****************************************************
  * CS_KERNEL_2D_SINGLE  *
  *****************************************************/
-bool Single2DNode::CreateTwiddleTableResource()
+bool Single2DNode::CreateDeviceResources()
 {
     // create one set of twiddles for each dimension
-    std::tie(twiddles, twiddles_size) = Repo::GetTwiddles2D(length[0], length[1], precision);
+    std::tie(twiddles, twiddles_size)
+        = Repo::GetTwiddles2D(length[0], length[1], precision, deviceProp.gcnArchName);
 
     return CreateLargeTwdTable();
 }
 
 void Single2DNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel = function_pool::get_kernel(fpkey(length[0], length[1], precision));
+    auto kernel = function_pool::get_kernel(GetKernelKey());
     fnPtr       = kernel.device_function;
     bwd         = kernel.transforms_per_block;
+    wgs         = kernel.workgroup_size;
 
     gp.b_x   = (batch + bwd - 1) / bwd;
-    gp.wgs_x = kernel.workgroup_size;
+    gp.wgs_x = wgs;
 
     // if fastest length is power of 2, pad it to avoid LDS bank conflicts
     auto padded_len0 = IsPo2(length[0]) ? length[0] + 1 : length[0];
diff -Nru rocfft-5.5.0/library/src/tree_node_3D.cpp rocfft-5.7.1/library/src/tree_node_3D.cpp
--- rocfft-5.5.0/library/src/tree_node_3D.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/tree_node_3D.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -23,21 +23,28 @@
 #include "function_pool.h"
 #include "logging.h"
 #include "node_factory.h"
+#include "tuning_helper.h"
 #include <numeric>
 
 /*****************************************************
  * 3D_RTRT  *
  *****************************************************/
-void RTRT3DNode::BuildTree_internal()
+void RTRT3DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     // 2d fft
     NodeMetaData xyPlanData(this);
     xyPlanData.length    = length;
     xyPlanData.dimension = 2;
-    auto xyPlan          = NodeFactory::CreateExplicitNode(xyPlanData, this);
+    // skip the decide scheme part in node factory
+    ComputeScheme determined_scheme = (noSolution) ? CS_NONE : child_schemes[0];
+    auto          xyPlan = NodeFactory::CreateExplicitNode(xyPlanData, this, determined_scheme);
     xyPlan->RecursiveBuildTree();
 
     // first transpose
+    if(!noSolution)
+        assert(child_schemes[1] == CS_KERNEL_TRANSPOSE_XY_Z);
     auto trans1Plan    = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE_XY_Z, this);
     trans1Plan->length = length;
     trans1Plan->SetTransposeOutputLength();
@@ -50,10 +57,14 @@
     zPlanData.length.push_back(length[2]);
     zPlanData.length.push_back(length[0]);
     zPlanData.length.push_back(length[1]);
-    auto zPlan = NodeFactory::CreateExplicitNode(zPlanData, this);
+    // skip the decide scheme part in node factory
+    determined_scheme = determined_scheme = (noSolution) ? CS_NONE : child_schemes[2];
+    auto zPlan = NodeFactory::CreateExplicitNode(zPlanData, this, determined_scheme);
     zPlan->RecursiveBuildTree();
 
     // second transpose
+    if(!noSolution)
+        assert(child_schemes[3] == CS_KERNEL_TRANSPOSE_Z_XY);
     auto trans2Plan    = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE_Z_XY, this);
     trans2Plan->length = zPlan->length;
     trans2Plan->SetTransposeOutputLength();
@@ -143,13 +154,17 @@
 /*****************************************************
  * 3D_TRTRTR  *
  *****************************************************/
-void TRTRTR3DNode::BuildTree_internal()
+void TRTRTR3DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     std::vector<size_t> cur_length = length;
 
     for(int i = 0; i < 6; i += 2)
     {
         // transpose Z_XY
+        if(!noSolution)
+            assert(child_schemes[i] == CS_KERNEL_TRANSPOSE_Z_XY);
         auto trans_plan    = NodeFactory::CreateNodeFromScheme(CS_KERNEL_TRANSPOSE_Z_XY, this);
         trans_plan->length = cur_length;
         trans_plan->SetTransposeOutputLength();
@@ -162,7 +177,9 @@
         NodeMetaData row_plan_data(this);
         row_plan_data.length    = cur_length;
         row_plan_data.dimension = 1;
-        auto row_plan           = NodeFactory::CreateExplicitNode(row_plan_data, this);
+        // skip the decide scheme part in node factory
+        ComputeScheme determined_scheme = (noSolution) ? CS_NONE : child_schemes[i + 1];
+        auto row_plan = NodeFactory::CreateExplicitNode(row_plan_data, this, determined_scheme);
         row_plan->RecursiveBuildTree();
 
         // TR
@@ -264,8 +281,10 @@
 /*****************************************************
  * CS_3D_BLOCK_RC  *
  *****************************************************/
-void BLOCKRC3DNode::BuildTree_internal()
+void BLOCKRC3DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     std::vector<size_t> cur_length = length;
 
     // NB:
@@ -317,7 +336,7 @@
         if(have_sbrc)
         {
             auto kernel = function_pool::get_kernel(
-                fpkey(cur_length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
+                fpkey(cur_length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC, TILE_ALIGNED));
 
             size_t otherDim = use_ZXY_sbrc ? cur_length[1] : cur_length[2];
             if(otherDim % kernel.transforms_per_block != 0)
@@ -344,8 +363,10 @@
 
         if(have_sbrc)
         {
-            auto sbrcScheme   = (use_ZXY_sbrc) ? CS_KERNEL_STOCKHAM_TRANSPOSE_Z_XY
-                                               : CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z;
+            auto sbrcScheme = (use_ZXY_sbrc) ? CS_KERNEL_STOCKHAM_TRANSPOSE_Z_XY
+                                             : CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z;
+            if(!noSolution)
+                assert(child_schemes[childNodes.size()] == sbrcScheme);
             auto sbrc_node    = NodeFactory::CreateNodeFromScheme(sbrcScheme, this);
             sbrc_node->length = cur_length;
             sbrc_node->SetTransposeOutputLength();
@@ -357,12 +378,16 @@
             NodeMetaData row_plan_data(this);
             row_plan_data.length    = cur_length;
             row_plan_data.dimension = 1;
-            auto row_plan           = NodeFactory::CreateExplicitNode(row_plan_data, this);
+            ComputeScheme determined_scheme
+                = (noSolution) ? CS_NONE : child_schemes[childNodes.size()];
+            auto row_plan = NodeFactory::CreateExplicitNode(row_plan_data, this, determined_scheme);
             row_plan->RecursiveBuildTree();
 
             // transpose XY_Z
             auto transScheme = (use_ZXY_sbrc) ? CS_KERNEL_TRANSPOSE_Z_XY : CS_KERNEL_TRANSPOSE_XY_Z;
-            auto trans_plan  = NodeFactory::CreateNodeFromScheme(transScheme, this);
+            if(!noSolution)
+                assert(child_schemes[childNodes.size() + 1] == transScheme);
+            auto trans_plan    = NodeFactory::CreateNodeFromScheme(transScheme, this);
             trans_plan->length = cur_length;
             trans_plan->SetTransposeOutputLength();
             if(!use_ZXY_sbrc)
@@ -389,11 +414,6 @@
 
 void BLOCKRC3DNode::AssignParams_internal()
 {
-    // could go as low as 3 kernels if all dimensions are SBRC-able,
-    // but less than 6.  If we ended up with 6 we should have just
-    // done 3D_TRTRTR instead.
-    assert(childNodes.size() >= 3 && childNodes.size() < 6);
-
     childNodes.front()->inStride = inStride;
     childNodes.front()->iDist    = iDist;
 
@@ -474,14 +494,18 @@
 /*****************************************************
  * CS_3D_BLOCK_CR  *
  *****************************************************/
-void BLOCKCR3DNode::BuildTree_internal()
+void BLOCKCR3DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     // TODO: It works only for 3 SBCR children nodes for now.
     //       The final logic will be similar to what SBRC has.
 
     std::vector<size_t> cur_length = length;
     for(int i = 0; i < 3; ++i)
     {
+        if(!noSolution)
+            assert(child_schemes[i] == CS_KERNEL_STOCKHAM_BLOCK_CR);
         auto node = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_CR, this);
         node->length.push_back(cur_length[2]);
         node->length.push_back(cur_length[0] * cur_length[1]);
@@ -526,8 +550,10 @@
 /*****************************************************
  * CS_3D_RC  *
  *****************************************************/
-void RC3DNode::BuildTree_internal()
+void RC3DNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
+    bool noSolution = child_schemes.empty();
+
     // 2d fft
     NodeMetaData xyPlanData(this);
     xyPlanData.length.push_back(length[0]);
@@ -538,7 +564,9 @@
     {
         xyPlanData.length.push_back(length[index]);
     }
-    auto xyPlan = NodeFactory::CreateExplicitNode(xyPlanData, this);
+    // skip the decide scheme part in node factory
+    ComputeScheme determined_scheme = (noSolution) ? CS_NONE : child_schemes[0];
+    auto          xyPlan = NodeFactory::CreateExplicitNode(xyPlanData, this, determined_scheme);
     xyPlan->RecursiveBuildTree();
 
     // z col fft
@@ -555,16 +583,27 @@
 
     // use explicit SBCC kernel if available
     std::unique_ptr<TreeNode> zPlan;
-    if(function_pool::has_SBCC_kernel(length[2], precision))
+
+    // skip the decide scheme part in node factory
+    determined_scheme = (noSolution) ? CS_NONE : child_schemes[1];
+    if(determined_scheme)
     {
-        zPlan            = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_CC, this);
-        zPlan->length    = zPlanData.length;
-        zPlan->dimension = 1;
+        zPlan = NodeFactory::CreateExplicitNode(zPlanData, this, determined_scheme);
+        zPlan->RecursiveBuildTree();
     }
     else
     {
-        zPlan = NodeFactory::CreateExplicitNode(zPlanData, this);
-        zPlan->RecursiveBuildTree();
+        if(function_pool::has_SBCC_kernel(length[2], precision))
+        {
+            zPlan            = NodeFactory::CreateNodeFromScheme(CS_KERNEL_STOCKHAM_BLOCK_CC, this);
+            zPlan->length    = zPlanData.length;
+            zPlan->dimension = 1;
+        }
+        else
+        {
+            zPlan = NodeFactory::CreateExplicitNode(zPlanData, this);
+            zPlan->RecursiveBuildTree();
+        }
     }
 
     // RC
@@ -602,42 +641,56 @@
 /*****************************************************
  * Base Class of fused SBRC and Transpose
  *****************************************************/
-bool SBRCTranspose3DNode::KernelCheck()
+FMKey SBRCTranspose3DNode::GetKernelKey() const
 {
-    // check we have the kernel,
-    // we always have aligned, get the kernel and the bwd
-    FMKey key = fpkey(length[0], precision, scheme, TILE_ALIGNED);
-    if(!function_pool::has_function(key))
+    if(specified_key)
+        return *specified_key.get();
+
+    // NB: Need to make sure that sbrcTranstype has the correct value
+    if(sbrcTranstype == SBRC_TRANSPOSE_TYPE::NONE)
     {
-        if(LOG_TRACE_ENABLED())
-            (*LogSingleton::GetInstance().GetTraceOS()) << PrintMissingKernelInfo(key);
-        return false;
+        // find the base kernel at first
+        FMKey baseKey = fpkey(length[0], precision, scheme, TILE_ALIGNED);
+        // if we have the base kernel, then we set the exact sbrc_trans_type and return the real key
+        // if we don't, then we simply return a key with NONE sbrc_trans_type
+        // which will make KernelCheck() trigger an exception
+        if(function_pool::has_function(baseKey))
+        {
+            auto bwd      = function_pool::get_kernel(baseKey).transforms_per_block;
+            sbrcTranstype = sbrc_transpose_type(bwd);
+        }
     }
 
-    auto bwd = function_pool::get_kernel(key).transforms_per_block;
+    return fpkey(length[0], precision, scheme, sbrcTranstype);
+}
 
-    // check if we have the sbrc_type that we are actually applying
-    sbrcTranstype = sbrc_transpose_type(bwd);
-    if(!function_pool::has_function(fpkey(length[0], precision, scheme, sbrcTranstype)))
-    {
-        if(LOG_TRACE_ENABLED())
-            (*LogSingleton::GetInstance().GetTraceOS())
-                << PrintMissingKernelInfo(fpkey(length[0], precision, scheme, sbrcTranstype));
+bool SBRCTranspose3DNode::KernelCheck(std::vector<FMKey>& kernel_keys)
+{
+    bool res = LeafNode::KernelCheck(kernel_keys);
+    if(!res)
         return false;
-    }
 
-    dir2regMode = (function_pool::get_kernel(key).direct_to_from_reg)
-                      ? DirectRegType::TRY_ENABLE_IF_SUPPORT
-                      : DirectRegType::FORCE_OFF_OR_NOT_SUPPORT;
+    // if we are doing tuning or running with the tuned solution, we have the specified_key.
+    // we must directly run the kernel with the exact setting as the config
+    // without the hardcoded tuning
+    if(specified_key != nullptr)
+        return true;
 
-    // set according to benchmark
-    SetDirectRegType();
+    // hardcoded tuning according to benchmark
+    TuneDirectRegType();
 
     return true;
 }
 
-void SBRCTranspose3DNode::SetDirectRegType()
+void SBRCTranspose3DNode::TuneDirectRegType()
 {
+    // half precision has not been tested yet, disable it for now.
+    if(precision == rocfft_precision_half)
+    {
+        dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
+        return;
+    }
+
     if(is_device_gcn_arch(deviceProp, "gfx906"))
     {
         // bad results from benchmark:
@@ -645,7 +698,7 @@
         //     {64,dp}, {81,dp}, {100,dp} are bad
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {49, 128}}, {rocfft_precision_double, {64, 81, 100}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     else if(is_device_gcn_arch(deviceProp, "gfx908"))
@@ -656,7 +709,7 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {81, 100, 128, 192, 200, 512}},
                {rocfft_precision_double, {81, 512}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     else if(is_device_gcn_arch(deviceProp, "gfx90a"))
@@ -667,7 +720,7 @@
         std::map<rocfft_precision, std::set<size_t>> exceptions
             = {{rocfft_precision_single, {49, 64, 81, 125, 128, 192, 200, 512}},
                {rocfft_precision_double, {64, 81, 100, 125}}};
-        if(exceptions.at(precision).count(length[0]))
+        if(length_excepted(exceptions, precision, length[0]))
             dir2regMode = FORCE_OFF_OR_NOT_SUPPORT;
     }
     // we don't enable the features for others
@@ -683,15 +736,14 @@
  *****************************************************/
 void SBRCTransXY_ZNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel
-        = function_pool::get_kernel(fpkey(length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
-    bwd           = kernel.transforms_per_block;
-    wgs           = kernel.workgroup_size;
-    lds           = length[0] * bwd;
-    sbrcTranstype = sbrc_transpose_type(bwd);
-    fnPtr         = function_pool::get_function(fpkey(length[0], precision, scheme, sbrcTranstype));
-    gp.b_x        = DivRoundingUp(length[2], bwd) * length[1] * batch;
-    gp.wgs_x      = kernel.workgroup_size;
+    // sbrcTransType has already been assigned in KernelCheck();
+    auto kernel = function_pool::get_kernel(GetKernelKey());
+    fnPtr       = kernel.device_function;
+    bwd         = kernel.transforms_per_block;
+    wgs         = kernel.workgroup_size;
+    lds         = length[0] * bwd;
+    gp.b_x      = DivRoundingUp(length[2], bwd) * length[1] * batch;
+    gp.wgs_x    = wgs;
 }
 
 /*****************************************************
@@ -700,15 +752,14 @@
  *****************************************************/
 void SBRCTransZ_XYNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel
-        = function_pool::get_kernel(fpkey(length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
-    bwd           = kernel.transforms_per_block;
-    wgs           = kernel.workgroup_size;
-    lds           = length[0] * bwd;
-    sbrcTranstype = sbrc_transpose_type(bwd);
-    fnPtr         = function_pool::get_function(fpkey(length[0], precision, scheme, sbrcTranstype));
-    gp.b_x        = DivRoundingUp(length[1], bwd) * length[2] * batch;
-    gp.wgs_x      = kernel.workgroup_size;
+    // sbrcTransType has already been assigned in KernelCheck();
+    auto kernel = function_pool::get_kernel(GetKernelKey());
+    fnPtr       = kernel.device_function;
+    bwd         = kernel.transforms_per_block;
+    wgs         = kernel.workgroup_size;
+    lds         = length[0] * bwd;
+    gp.b_x      = DivRoundingUp(length[1], bwd) * length[2] * batch;
+    gp.wgs_x    = wgs;
 }
 
 /*****************************************************
@@ -717,16 +768,15 @@
  *****************************************************/
 void RealCmplxTransZ_XYNode::SetupGPAndFnPtr_internal(DevFnCall& fnPtr, GridParam& gp)
 {
-    auto kernel
-        = function_pool::get_kernel(fpkey(length[0], precision, CS_KERNEL_STOCKHAM_BLOCK_RC));
-    bwd           = kernel.transforms_per_block;
-    wgs           = kernel.workgroup_size;
-    lds           = length[0] * bwd;
-    lds_padding   = 1;
-    sbrcTranstype = sbrc_transpose_type(bwd);
-    fnPtr         = function_pool::get_function(fpkey(length[0], precision, scheme, sbrcTranstype));
-    gp.b_x        = DivRoundingUp(length[1], bwd) * length[2] * batch;
-    gp.wgs_x      = kernel.workgroup_size;
+    // sbrcTransType has already been assigned in KernelCheck();
+    auto kernel = function_pool::get_kernel(GetKernelKey());
+    fnPtr       = kernel.device_function;
+    bwd         = kernel.transforms_per_block;
+    wgs         = kernel.workgroup_size;
+    lds_padding = 1;
+    lds         = (length[0] + lds_padding) * bwd;
+    gp.b_x      = DivRoundingUp(length[1], bwd) * length[2] * batch;
+    gp.wgs_x    = wgs;
 }
 
 bool RealCmplxTransZ_XYNode::CreateDevKernelArgs()
diff -Nru rocfft-5.5.0/library/src/tree_node_bluestein.cpp rocfft-5.7.1/library/src/tree_node_bluestein.cpp
--- rocfft-5.5.0/library/src/tree_node_bluestein.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/tree_node_bluestein.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -24,7 +24,7 @@
 #include "node_factory.h"
 #include <numeric>
 
-inline size_t FindBlue(size_t len, rocfft_precision precision, bool forcePow2)
+size_t BluesteinNode::FindBlue(size_t len, rocfft_precision precision, bool forcePow2)
 {
     if(forcePow2)
     {
@@ -60,34 +60,70 @@
     return length;
 }
 
-/*****************************************************
- * CS_BLUESTEIN
- *****************************************************/
-void BluesteinNode::BuildTree_internal()
+BluesteinType BluesteinNode::DecideBlueType()
 {
     bool useSingleKernel = BluesteinSingleNode::SizeFits(length[0], precision);
 
-    // Build a node for a 1D stage using the Bluestein algorithm for
-    // general transform lengths.
-
     // single kernel sticks to pow2 lengthBlue.  the kernel does many
     // other things besides FFTs, so keep radices simple to reduce
     // VGPR usage.
     lengthBlue = FindBlue(length[0], precision, useSingleKernel);
 
-    auto chirpPlan       = NodeFactory::CreateNodeFromScheme(CS_KERNEL_CHIRP, this);
-    chirpPlan->dimension = 1;
-    chirpPlan->length.push_back(length[0]);
-    chirpPlan->lengthBlue = lengthBlue;
-    chirpPlan->direction  = direction;
-    chirpPlan->batch      = 1;
-    chirpPlan->large1D    = 2 * length[0];
-
-    // single kernel requires a single lengthBlue FFT on the second
-    // half of chirp buffer before we do the rest of the Bluestein
-    // steps that kernel
     if(useSingleKernel)
+        return BluesteinType::BT_SINGLE_KERNEL;
+
+    NodeMetaData bluePlanData(this);
+    bluePlanData.length.push_back(lengthBlue);
+    bluePlanData.direction = direction;
+    bluePlanData.batch     = batch;
+
+    auto scheme = NodeFactory::DecideNodeScheme(bluePlanData, this);
+
+    if(scheme == CS_L1D_CC)
     {
+        // Allow fused Bluestein optimization only for 1D
+        // complex forward and complex inverse transforms.
+        auto fusedBluesteinAllow = (parent) ? false : true;
+
+        auto type = fusedBluesteinAllow ? BluesteinType::BT_MULTI_KERNEL_FUSED
+                                        : BluesteinType::BT_MULTI_KERNEL;
+
+        return type;
+    }
+
+    if(scheme == CS_L1D_CRT || scheme == CS_L1D_TRTRT)
+        return BluesteinType::BT_MULTI_KERNEL;
+
+    return BluesteinType::BT_NONE;
+}
+
+/*****************************************************
+ * CS_BLUESTEIN
+ *****************************************************/
+void BluesteinNode::BuildTree_internal(const SchemeVec& child_schemes)
+{
+    // Build a node for a 1D stage using the Bluestein algorithm for
+    // general transform lengths.
+    typeBlue = DecideBlueType();
+
+    switch(typeBlue)
+    {
+    case BT_SINGLE_KERNEL:
+    {
+        // single kernel requires a single lengthBlue FFT on the second
+        // half of chirp buffer before we do the rest of the Bluestein
+        // steps in that kernel
+
+        typeBlue = BluesteinType::BT_SINGLE_KERNEL;
+
+        auto chirpPlan       = NodeFactory::CreateNodeFromScheme(CS_KERNEL_CHIRP, this);
+        chirpPlan->dimension = 1;
+        chirpPlan->length.push_back(length[0]);
+        chirpPlan->lengthBlue = lengthBlue;
+        chirpPlan->direction  = direction;
+        chirpPlan->batch      = 1;
+        chirpPlan->large1D    = 2 * length[0];
+
         NodeMetaData chirpFFTPlanData(this);
         chirpFFTPlanData.dimension = 1;
         chirpFFTPlanData.length.push_back(lengthBlue);
@@ -105,10 +141,85 @@
         childNodes.emplace_back(std::move(chirpPlan));
         childNodes.emplace_back(std::move(chirpFFTPlan));
         childNodes.emplace_back(std::move(singlePlan));
+
+        break;
+    }
+    case BT_MULTI_KERNEL_FUSED:
+    {
+        typeBlue = BluesteinType::BT_MULTI_KERNEL_FUSED;
+
+        // first node: fused chirp + padding + forward fft
+        NodeMetaData fftFwdChirpPadPlanData(this);
+        fftFwdChirpPadPlanData.dimension = 1;
+        fftFwdChirpPadPlanData.length.push_back(lengthBlue);
+        fftFwdChirpPadPlanData.batch = 1;
+        auto fftFwdChirpPadPlan = NodeFactory::CreateExplicitNode(fftFwdChirpPadPlanData, this);
+        fftFwdChirpPadPlan->direction   = direction;
+        fftFwdChirpPadPlan->typeBlue    = typeBlue;
+        fftFwdChirpPadPlan->fuseBlue    = BluesteinFuseType::BFT_FWD_CHIRP;
+        fftFwdChirpPadPlan->lengthBlue  = lengthBlue;
+        fftFwdChirpPadPlan->lengthBlueN = length[0];
+        fftFwdChirpPadPlan->comments.push_back("Fused chirp + padding w/ fwd FFT");
+        fftFwdChirpPadPlan->RecursiveBuildTree();
+        for(auto& child : fftFwdChirpPadPlan->childNodes)
+            child->comments.push_back("Stockham kernel fused with Bluestein ops");
+
+        // second node: fused chirp / input Hadamard product + padding + forward fft
+        NodeMetaData fftFwdChirpMulPadPlanData(this);
+        fftFwdChirpMulPadPlanData.dimension = 1;
+        fftFwdChirpMulPadPlanData.length.push_back(lengthBlue);
+        for(size_t index = 1; index < length.size(); index++)
+        {
+            fftFwdChirpMulPadPlanData.length.push_back(length[index]);
+        }
+        auto fftFwdChirpMulPadPlan
+            = NodeFactory::CreateExplicitNode(fftFwdChirpMulPadPlanData, this);
+        fftFwdChirpMulPadPlan->direction   = direction;
+        fftFwdChirpMulPadPlan->lengthBlue  = lengthBlue;
+        fftFwdChirpMulPadPlan->lengthBlueN = length[0];
+        fftFwdChirpMulPadPlan->typeBlue    = typeBlue;
+        fftFwdChirpMulPadPlan->fuseBlue    = BluesteinFuseType::BFT_FWD_CHIRP_MUL;
+        fftFwdChirpMulPadPlan->comments.push_back(
+            "Fused chirp/input Hadamard prod + padding w/ fwd FFT");
+        fftFwdChirpMulPadPlan->RecursiveBuildTree();
+        for(auto& child : fftFwdChirpMulPadPlan->childNodes)
+            child->comments.push_back("Stockham kernel fused with Bluestein ops");
+
+        // third node: fused convolution Hadamard product + inverse fft + chirp Hadamard product
+        NodeMetaData fftInvMulChirpMulPlanData(this);
+        fftInvMulChirpMulPlanData.dimension = 1;
+        fftInvMulChirpMulPlanData.direction = -direction;
+        fftInvMulChirpMulPlanData.length.push_back(lengthBlue);
+        for(size_t index = 1; index < length.size(); index++)
+        {
+            fftInvMulChirpMulPlanData.length.push_back(length[index]);
+        }
+        auto fftInvMulChirpMulPlan
+            = NodeFactory::CreateExplicitNode(fftInvMulChirpMulPlanData, this);
+        fftInvMulChirpMulPlan->lengthBlue  = lengthBlue;
+        fftInvMulChirpMulPlan->lengthBlueN = length[0];
+        fftInvMulChirpMulPlan->typeBlue    = typeBlue;
+        fftInvMulChirpMulPlan->fuseBlue    = BluesteinFuseType::BFT_INV_CHIRP_MUL;
+        fftInvMulChirpMulPlan->comments.push_back(
+            "Fused convolution input Hadamard prod + chirp/output Hadamard prod w/ inv FFT");
+        fftInvMulChirpMulPlan->RecursiveBuildTree();
+        for(auto& child : fftInvMulChirpMulPlan->childNodes)
+            child->comments.push_back("Stockham kernel fused with Bluestein ops");
+
+        childNodes.emplace_back(std::move(fftFwdChirpPadPlan));
+        childNodes.emplace_back(std::move(fftFwdChirpMulPadPlan));
+        childNodes.emplace_back(std::move(fftInvMulChirpMulPlan));
+        break;
     }
-    else
+    case BT_MULTI_KERNEL:
     {
-        // otherwise, use multiple kernels for all the Bluestein steps
+        auto chirpPlan       = NodeFactory::CreateNodeFromScheme(CS_KERNEL_CHIRP, this);
+        chirpPlan->dimension = 1;
+        chirpPlan->length.push_back(length[0]);
+        chirpPlan->lengthBlue = lengthBlue;
+        chirpPlan->direction  = direction;
+        chirpPlan->batch      = 1;
+        chirpPlan->large1D    = 2 * length[0];
 
         auto padmulPlan        = NodeFactory::CreateNodeFromScheme(CS_KERNEL_PAD_MUL, this);
         padmulPlan->dimension  = 1;
@@ -165,14 +276,19 @@
         childNodes.emplace_back(std::move(fftmulPlan));
         childNodes.emplace_back(std::move(fftrPlan));
         childNodes.emplace_back(std::move(resmulPlan));
+
+        break;
+    }
+    case BT_NONE:
+        throw std::runtime_error("Invalid Bluestein type");
     }
 }
 
 void BluesteinNode::AssignParams_internal()
 {
-    // should either be in a 3-kernel BLUESTEIN_SINGLE plan, or a
-    // 6-kernel multi-kernel Bluestein plan
-    if(childNodes.size() == 3)
+    switch(typeBlue)
+    {
+    case BT_SINGLE_KERNEL:
     {
         auto& chirpPlan    = childNodes[0];
         auto& chirpFFTPlan = childNodes[1];
@@ -194,8 +310,48 @@
         singlePlan->outStride = outStride;
         singlePlan->oDist     = oDist;
         singlePlan->AssignParams();
+
+        break;
     }
-    else if(childNodes.size() == 6)
+    case BT_MULTI_KERNEL_FUSED:
+    {
+        auto& fftFwdChirpPadPlan    = childNodes[0];
+        auto& fftFwdChirpMulPadPlan = childNodes[1];
+        auto& fftInvMulChirpMulPlan = childNodes[2];
+
+        fftFwdChirpPadPlan->inStride.push_back(1);
+        fftFwdChirpPadPlan->inStrideBlue.push_back(1);
+        fftFwdChirpPadPlan->iDist     = fftFwdChirpPadPlan->lengthBlueN;
+        fftFwdChirpPadPlan->iDistBlue = fftFwdChirpPadPlan->lengthBlue;
+        fftFwdChirpPadPlan->outStride.push_back(1);
+        fftFwdChirpPadPlan->outStrideBlue.push_back(1);
+        fftFwdChirpPadPlan->oDist     = fftFwdChirpPadPlan->lengthBlueN;
+        fftFwdChirpPadPlan->oDistBlue = fftFwdChirpPadPlan->lengthBlue;
+        fftFwdChirpPadPlan->AssignParams();
+
+        fftFwdChirpMulPadPlan->inStride      = inStride;
+        fftFwdChirpMulPadPlan->inStrideBlue  = inStrideBlue;
+        fftFwdChirpMulPadPlan->iDist         = iDist;
+        fftFwdChirpMulPadPlan->iDistBlue     = iDistBlue;
+        fftFwdChirpMulPadPlan->outStride     = outStride;
+        fftFwdChirpMulPadPlan->outStrideBlue = outStrideBlue;
+        fftFwdChirpMulPadPlan->oDist         = oDist;
+        fftFwdChirpMulPadPlan->oDistBlue     = oDistBlue;
+        fftFwdChirpMulPadPlan->AssignParams();
+
+        fftInvMulChirpMulPlan->inStride      = inStride;
+        fftInvMulChirpMulPlan->inStrideBlue  = inStrideBlue;
+        fftInvMulChirpMulPlan->iDist         = iDist;
+        fftInvMulChirpMulPlan->iDistBlue     = iDistBlue;
+        fftInvMulChirpMulPlan->outStride     = outStride;
+        fftInvMulChirpMulPlan->outStrideBlue = outStrideBlue;
+        fftInvMulChirpMulPlan->oDist         = oDist;
+        fftInvMulChirpMulPlan->oDistBlue     = oDistBlue;
+        fftInvMulChirpMulPlan->AssignParams();
+
+        break;
+    }
+    case BT_MULTI_KERNEL:
     {
         auto& chirpPlan  = childNodes[0];
         auto& padmulPlan = childNodes[1];
@@ -243,10 +399,11 @@
         resmulPlan->iDist     = fftrPlan->oDist;
         resmulPlan->outStride = outStride;
         resmulPlan->oDist     = oDist;
+
+        break;
     }
-    else
-    {
-        throw std::runtime_error("unexpected bluestein plan shape");
+    case BT_NONE:
+        throw std::runtime_error("Invalid Bluestein type");
     }
 }
 
diff -Nru rocfft-5.5.0/library/src/tree_node_real.cpp rocfft-5.7.1/library/src/tree_node_real.cpp
--- rocfft-5.5.0/library/src/tree_node_real.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/tree_node_real.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -105,7 +105,7 @@
 /*****************************************************
  * CS_REAL_TRANSFORM_USING_CMPLX
  *****************************************************/
-void RealTransCmplxNode::BuildTree_internal()
+void RealTransCmplxNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
     // Embed the data into a full-length complex array, perform a
     // complex transform, and then extract the relevant output.
@@ -186,7 +186,7 @@
 /*****************************************************
  * CS_REAL_TRANSFORM_EVEN
  *****************************************************/
-void RealTransEvenNode::BuildTree_internal()
+void RealTransEvenNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
     // Fastest moving dimension must be even:
     assert(length[0] % 2 == 0);
@@ -432,7 +432,7 @@
 /*****************************************************
  * CS_REAL_2D_EVEN
  *****************************************************/
-void Real2DEvenNode::BuildTree_internal()
+void Real2DEvenNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
     // Fastest moving dimension must be even:
     assert(length[0] % 2 == 0);
@@ -779,7 +779,7 @@
 /*****************************************************
  * CS_REAL_3D_EVEN
  *****************************************************/
-void Real3DEvenNode::BuildTree_internal()
+void Real3DEvenNode::BuildTree_internal(const SchemeVec& child_schemes)
 {
     Build_solution();
 
diff -Nru rocfft-5.5.0/library/src/tuning_helper.cpp rocfft-5.7.1/library/src/tuning_helper.cpp
--- rocfft-5.5.0/library/src/tuning_helper.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/tuning_helper.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,497 @@
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "tuning_helper.h"
+#include "../../shared/environment.h"
+#include "function_pool.h"
+#include "rocfft.h"
+#include "solution_map.h"
+#include "twiddles.h"
+
+#include <fstream>
+#include <iterator>
+#include <random>
+#include <set>
+#include <unordered_set>
+
+static const char* results_folder = "ResultSolutions";
+static const char* csv_out_folder = "TuningData";
+
+TuningBenchmarker::~TuningBenchmarker()
+{
+    if(!(binding_solution_map == nullptr && packet == nullptr))
+        Clean(); // a map is still bound or a packet is still owned: release them
+}
+
+void TuningBenchmarker::Setup()
+{
+    // start from a fresh packet; a previously owned one is released here
+    packet.reset(new rocfft_tuning_packet());
+    const std::string dump_env  = rocfft_getenv("DUMP_TUNING");
+    const std::string exact_env = rocfft_getenv("TUNE_EXACT_PROB");
+    // either option is switched on by any non-empty result of rocfft_getenv()
+    if(!dump_env.empty())
+        packet->dump_candidates = true;
+    if(!exact_env.empty())
+        packet->export_full_token = true;
+}
+
+// Forget the bound solution map (without deleting it) and destroy the tuning packet.
+void TuningBenchmarker::Clean()
+{
+    binding_solution_map = nullptr;
+    // reset() copes with an already-empty pointer and always leaves it null
+    packet.reset();
+}
+
+rocfft_tuning_packet* TuningBenchmarker::GetPacket()
+{
+    return packet.get(); // non-owning view; null once Clean() has reset the packet
+}
+
+void TuningBenchmarker::SetBindingSolutionMap(solution_map* sol_map)
+{
+    binding_solution_map = sol_map; // only the pointer is kept; Clean() nulls it without deleting
+}
+
+solution_map* TuningBenchmarker::GetBindingSolutionMap()
+{
+    return binding_solution_map; // nullptr after Clean()
+}
+
+bool TuningBenchmarker::SetInitStep(int tuning_phase)
+{
+    // enter the init step of the given phase: not tuning, no node/candidate selected (-1)
+    auto& pkt          = *packet;
+    pkt.tuning_phase   = tuning_phase;
+    pkt.init_step      = true;
+    pkt.is_tuning      = false;
+    pkt.total_nodes    = 0;
+    pkt.tuning_node_id = -1;
+    pkt.current_ssn    = -1;
+    pkt.total_candidates.clear();
+    // keep the per-node slots but drop the benchmark records they hold
+    for(size_t node = 0; node < benchmark_infos_of_node.size(); ++node)
+        benchmark_infos_of_node[node].clear();
+    return true;
+}
+
+bool TuningBenchmarker::IsInitializingTuning()
+{
+    return packet ? static_cast<bool>(packet->init_step) : false; // no packet: not initializing
+}
+
+// True only while tuning is in progress; a true result guarantees a valid GetPacket()
+bool TuningBenchmarker::IsProcessingTuning()
+{
+    return (packet && packet->is_tuning);
+}
+
+// Called between each candidate: wipes the per-node kernel statistics held by the packet
+void TuningBenchmarker::ResetKernelInfo()
+{
+    const int node_count = packet->total_nodes;
+    // every container is emptied before it is resized, so none of its node_count
+    // slots keeps a value recorded for the previous candidate
+    packet->kernel_names.clear();
+    packet->kernel_names.resize(node_count);
+    packet->factors_str.clear();
+    packet->factors_str.resize(node_count);
+    packet->util_rate.clear();
+    packet->util_rate.resize(node_count);
+    packet->bw_effs.clear();
+    packet->bw_effs.resize(node_count);
+    packet->num_of_blocks.clear();
+    packet->num_of_blocks.resize(node_count);
+    packet->wgs.clear();
+    packet->wgs.resize(node_count);
+    packet->tpt.clear();
+    packet->tpt.resize(node_count);
+    packet->tpb.clear();
+    packet->tpb.resize(node_count);
+    packet->lds_bytes.clear();
+    packet->lds_bytes.resize(node_count);
+    packet->occupancy.clear();
+    packet->occupancy.resize(node_count);
+}
+
+int TuningBenchmarker::UpdateNumOfTuningNodes()
+{
+    // without a packet there is nothing to size
+    if(packet == nullptr)
+        return 0;
+
+    // give every per-node container exactly one slot per tuning node
+    const int node_count = packet->total_nodes;
+    packet->target_factors.resize(node_count);
+    packet->winner_kernel_names.resize(node_count);
+    packet->winner_ids.resize(node_count);
+    packet->winner_phases.resize(node_count);
+    benchmark_infos_of_node.resize(node_count);
+    return node_count;
+}
+
+int TuningBenchmarker::GetNumOfKernelCandidates(size_t node_id)
+{
+    // no packet, or a node without a recorded candidate count, reports zero
+    if(!packet || node_id >= packet->total_candidates.size())
+        return 0;
+    return packet->total_candidates[node_id];
+}
+
+bool TuningBenchmarker::SetCurrentTuningNodeId(size_t node_id)
+{
+    // accept the id only if it addresses one of the packet's total_nodes nodes
+    const bool in_range = node_id < static_cast<size_t>(packet->total_nodes);
+    if(in_range)
+        packet->tuning_node_id = node_id;
+    return in_range;
+}
+
+bool TuningBenchmarker::SetCurrentKernelCandidateId(size_t kernel_config_id)
+{
+    // bounds-checked lookup: an unknown node (e.g. tuning_node_id still -1 after
+    // SetInitStep()) has 0 candidates and is rejected instead of indexing out of range
+    if(kernel_config_id >= (size_t)GetNumOfKernelCandidates(packet->tuning_node_id))
+        return false;
+    packet->current_ssn = kernel_config_id;
+    ResetKernelInfo();
+    return true;
+}
+
+BenchmarkInfo TuningBenchmarker::GetCurrBenchmarkInfo()
+{
+    // snapshot what the packet currently reports for the node being tuned
+    const int node  = packet->tuning_node_id;
+    const int phase = packet->tuning_phase;
+    const int ssn   = packet->current_ssn;
+    BenchmarkInfo record;
+    record.tuning_phase      = phase;
+    record.SSN               = ssn;
+    record.prob_token        = packet->tuning_problem_name;
+    record.kernel_name       = packet->kernel_names[node];
+    record.factors_str       = packet->factors_str[node];
+    record.util_rate         = packet->util_rate[node];
+    record.num_blocks        = packet->num_of_blocks[node];
+    record.workgroup_size    = packet->wgs[node];
+    record.threads_per_trans = packet->tpt[node];
+    record.trans_per_block   = packet->tpb[node];
+    record.LDS_bytes         = packet->lds_bytes[node];
+    record.occupancy         = packet->occupancy[node];
+    record.numCUs            = packet->numCUs;
+    record.granularity       = (double)record.num_blocks / record.numCUs;
+
+    // besides being returned, the snapshot is appended (in call order) to the
+    // benchmark records of this node
+    benchmark_infos_of_node[node].push_back(record);
+    return record;
+}
+
+void TuningBenchmarker::UpdateCurrBenchResult(double ms, double gflops)
+{
+    const int    node      = packet->tuning_node_id;
+    const int    candidate = packet->current_ssn;
+    const double bw_eff    = packet->bw_effs[node];
+
+    // the records are expected to be un-sorted at this point, so the candidate id
+    // is used directly as the index into this node's records
+    auto& record         = benchmark_infos_of_node[node][candidate];
+    record.gflops        = gflops;
+    record.milli_seconds = ms;
+    record.bw_eff        = bw_eff;
+}
+
+// Pick the winner of the current tuning node (in/out arguments) and store it in the packet
+void TuningBenchmarker::FindWinnerForCurrNode(double&      curr_best_msec,
+                                              int&         winner_phase,
+                                              int&         winner_config_id,
+                                              std::string& winner_kernel_name)
+{
+    int   curr_tuning_node_id = packet->tuning_node_id;
+    auto& bench_infos_vec     = benchmark_infos_of_node[curr_tuning_node_id];
+
+    // built-in kernels are reported as phase 0 / id 0
+    if(packet->is_builtin_kernel[curr_tuning_node_id])
+    {
+        winner_phase       = 0;
+        winner_config_id   = 0;
+        winner_kernel_name = "built_in_kernel";
+    }
+    else
+    {
+        // set to previous result in case there is no new candidate in this phase
+        // for example, len-125 can only be 5x5x5 so it does nothing in phase-2
+        winner_phase       = packet->winner_phases[curr_tuning_node_id];
+        winner_config_id   = packet->winner_ids[curr_tuning_node_id];
+        winner_kernel_name = packet->winner_kernel_names[curr_tuning_node_id];
+    }
+
+    // if not empty, sort (highest gflops first; the comparator is read-only) and update IDs
+    if(!bench_infos_vec.empty())
+    {
+        std::sort(bench_infos_vec.begin(),
+                  bench_infos_vec.end(),
+                  [](const BenchmarkInfo& a, const BenchmarkInfo& b) {
+                      return a.gflops > b.gflops;
+                  });
+        // check if the best of this phase is better than previous winner
+        auto& winner_of_this_phase = bench_infos_vec.front();
+        if(winner_of_this_phase.milli_seconds < curr_best_msec)
+        {
+            winner_phase       = winner_of_this_phase.tuning_phase;
+            winner_config_id   = winner_of_this_phase.SSN;
+            winner_kernel_name = winner_of_this_phase.kernel_name;
+            curr_best_msec     = winner_of_this_phase.milli_seconds; // best ms up to now
+        }
+    }
+
+    packet->winner_phases[curr_tuning_node_id]       = winner_phase;
+    packet->winner_ids[curr_tuning_node_id]          = winner_config_id;
+    packet->winner_kernel_names[curr_tuning_node_id] = winner_kernel_name;
+}
+
+void TuningBenchmarker::PropagateBestFactorsToNextPhase()
+{
+    const int node    = packet->tuning_node_id;
+    auto&     records = benchmark_infos_of_node[node];
+    auto&     targets = packet->target_factors[node];
+
+    // clear previous data
+    targets.clear();
+
+    // the records are already sorted by gflops, so walking them front to back
+    // meets every factorization best-first; only the first occurrence of each
+    // distinct factors string counts
+    std::set<std::string> seen;
+    size_t                kept = 0;
+
+    // the next phase of tuning (permuting) focuses on the best 3 factors at most
+    const size_t max_kept = 3;
+    for(const auto& record : records)
+    {
+        if(kept == max_kept)
+            break;
+        if(!seen.insert(record.factors_str).second)
+            continue;
+
+        targets.insert(record.factors_str);
+        ++kept;
+    }
+}
+
+// Export the winner solution: re-add the solution tree of the tuning problem with the winning kernels, then write it to a file
+void TuningBenchmarker::ExportWinnerToSolutions()
+{
+    // Information about the winner kernels (each vector is indexed by kernel node id below)
+    std::vector<int>&         winners           = packet->winner_ids;
+    std::vector<int>&         winners_phase     = packet->winner_phases;
+    std::vector<bool>&        is_builtin_kernel = packet->is_builtin_kernel;
+    std::vector<std::string>& kernelTokens      = packet->tuning_kernel_tokens;
+
+    // root solution node for the tuning problem
+    std::string rootToken    = packet->tuning_problem_name;
+    std::string archName     = packet->tuning_arch_name;
+    ProblemKey  rootKey      = ProblemKey(archName, rootToken);
+    auto&       rootSolution = binding_solution_map->get_solution_node(rootKey);
+
+    auto InsertWinnerKernelSolution = [&](size_t kernel_node_id) -> size_t {
+        // get the default kernel token (without the extra phase, bench, etc. suffixes)
+        int         kernel_winner = winners[kernel_node_id];
+        int         phase_idx     = winners_phase[kernel_node_id];
+        bool        is_builtin    = is_builtin_kernel[kernel_node_id];
+        std::string kernelToken   = kernelTokens[kernel_node_id];
+        std::string srcToken      = kernelToken;
+
+        // for a tuned external kernel, the source token is elaborated with the node id and the winning phase;
+        // a built-in kernel has a fixed (not elaborated) token string
+        if(is_builtin == false)
+        {
+            srcToken += std::string("_leafnode_") + std::to_string(kernel_node_id);
+            srcToken += std::string("_phase_") + std::to_string(phase_idx);
+        }
+
+        // src = elaborated tuning token (with the extra phase, bench, etc. suffixes),
+        // dst = token without them.
+        std::string dstToken = kernelToken;
+        ProblemKey  srcKey(archName, srcToken);
+        ProblemKey  dstKey(archName, dstToken);
+
+        // get the winner of the node from current solution map (primary map)
+        SolutionNode kernel_solution
+            = binding_solution_map->get_solution_node(srcKey, kernel_winner);
+
+        // and insert to temp_working_map (isRoot=false, check_dup=true, primary=false)
+        return binding_solution_map->add_solution(dstKey, kernel_solution, false, true, false);
+    };
+
+    size_t current_adding_kernel_node = 0; // advanced by one for every leaf node visited
+
+    // This recursion adds solutions bottom-up:
+    // recursively add every solution node from the current map to the tuning map, checking for duplication.
+    auto RecursivelyAddSolution
+        = [&](ProblemKey& key, SolutionNode& solution, auto&& RecursivelyAddSolution) -> size_t {
+        // if adding a leaf node (together with a kernel node)
+        // then we add the kernel-solution from winners in tuning packet,
+        // and get the child_option of that kernel, update this value to the leaf node child_option
+        if(solution.sol_node_type == SOL_LEAF_NODE)
+        {
+            // insert kernel to tuning_map, return the position(index) in the vector
+            size_t child_option = InsertWinnerKernelSolution(current_adding_kernel_node);
+            ++current_adding_kernel_node;
+
+            // update the child_option value for tuning_map
+            solution.solution_childnodes[0].child_option = child_option;
+        }
+        // Adding an internal node, simply recursively adding its children
+        else
+        {
+            for(auto& child : solution.solution_childnodes)
+            {
+                ProblemKey childKey(archName, child.child_token);
+
+                // get the child solution object from current solution map (using existing child_option)
+                auto& childSol
+                    = binding_solution_map->get_solution_node(childKey, child.child_option);
+
+                // the solution is being added to another solution map, so the child option has to be updated
+                child.child_option
+                    = RecursivelyAddSolution(childKey, childSol, RecursivelyAddSolution);
+            }
+        }
+
+        // add itself with new child_option value.
+        // the add_solution(s) here is to build a final solution tree with proper option_id
+        // so we need to check_dup.
+        // insert to temp_working_map (check_dup=true, primary=false)
+        bool isSolutionRoot = (key == rootKey);
+        return binding_solution_map->add_solution(key, solution, isSolutionRoot, true, false);
+    };
+    // Start the recursion at the root (the lambda receives itself as its last argument)
+    RecursivelyAddSolution(rootKey, rootSolution, RecursivelyAddSolution);
+
+    // Then output to a solution map file from the temp_working_map;
+    // the file is named <arch>_<problem>.dat and placed in results_folder under $TUNING_WORKSPACE
+    std::string filename         = archName + "_" + rootToken + ".dat";
+    std::string workspace_folder = "";
+    workspace_folder             = rocfft_getenv("TUNING_WORKSPACE");
+
+    fs::path result_path(workspace_folder.c_str());
+    result_path /= results_folder;
+    result_path /= filename.c_str();
+
+    // sort=true, primary_map=false
+    binding_solution_map->write_solution_map_data(result_path, true, false);
+
+    packet->output_solution_map_path = result_path.string();
+}
+
+void TuningBenchmarker::GetOutputSolutionMapPath(std::string& out_path)
+{
+    out_path.assign(packet->output_solution_map_path); // path recorded by ExportWinnerToSolutions()
+}
+
+bool TuningBenchmarker::ExportCSV(bool append_data)
+{
+    // Dump the benchmark records of the current tuning node as a CSV table. Returns false
+    // only if the file cannot be opened; a node with nothing to export counts as success.
+    int curr_tuning_node_id = packet->tuning_node_id;
+    // skip the kernel-node with #-candidates = 0
+    if(packet->total_candidates[curr_tuning_node_id] == 0)
+        return true;
+
+    // the file name is taken from the first record, so skip when none was collected
+    auto& bench_infos_vec = benchmark_infos_of_node[curr_tuning_node_id];
+    if(bench_infos_vec.empty())
+        return true;
+
+    std::string filename         = bench_infos_vec[0].prob_token + ".csv";
+    std::string workspace_folder = rocfft_getenv("TUNING_WORKSPACE");
+
+    fs::path csv_path(workspace_folder.c_str());
+    csv_path /= csv_out_folder;
+    csv_path /= filename.c_str();
+
+    std::ofstream outfile(csv_path.c_str(),
+                          (append_data) ? (std::ios::out | std::ios::app | std::ios::ate)
+                                        : (std::ios::out | std::ios::trunc));
+    if(!outfile.is_open())
+        return false;
+
+    // if appending to file, add a few new lines
+    if(append_data)
+        outfile << std::endl << std::endl;
+
+    outfile << "SSN, Problem, MS, GFLOPS, NumBlocks, WGS, TPT, TPB, LDS_Bytes, Util_Rate, "
+               "Factors, Occupancy, "
+               "NumCUs, Granularity, BW_EFF, KernelName"
+            << std::endl;
+
+    for(auto& info : bench_infos_vec)
+    {
+        outfile << info.SSN << "," << info.prob_token << "," << info.milli_seconds << ","
+                << info.gflops << "," << info.num_blocks << "," << info.workgroup_size << ","
+                << info.threads_per_trans << "," << info.trans_per_block << "," << info.LDS_bytes
+                << ","
+                << "\"" << info.util_rate << "\""
+                << ","
+                << "\"" << info.factors_str << "\""
+                << "," << info.occupancy << "," << info.numCUs << "," << info.granularity << ","
+                << info.bw_eff << "," << info.kernel_name << std::endl;
+    }
+    // the stream is flushed and closed by its destructor
+    return true;
+}
+
+bool TuningBenchmarker::MergingSolutionsMaps(const std::string& base_map_path,
+                                             const std::string& new_map_path,
+                                             const std::string& probKeyStr,
+                                             const std::string& out_map_path)
+{
+    // load the existing solutions first; this read goes to the primary map
+    if(!binding_solution_map->read_solution_map_data(base_map_path))
+        return false;
+
+    bool merged_and_written = false;
+    try
+    {
+        // probKeyStr is expected as "arch:probToken": split it at the first ':'
+        const size_t colon = probKeyStr.find(":");
+        if(colon == std::string::npos)
+            throw std::runtime_error(probKeyStr
+                                     + " is an inccorect probKeyStr format. Please pass probKeyStr "
+                                       "as \"arch:probToken\"");
+
+        std::string arch  = probKeyStr.substr(0, colon);
+        std::string token = probKeyStr.substr(colon + 1);
+
+        // read the merging-solutions from the new file into the primary map, and only
+        // when that succeeds write the merged map (sorted, primary) to out_map_path
+        std::vector<ProblemKey> merging_problems = {ProblemKey(arch, token)};
+        merged_and_written
+            = binding_solution_map->merge_solutions_from_file(new_map_path, merging_problems)
+              && binding_solution_map->write_solution_map_data(out_map_path);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << e.what() << std::endl;
+    }
+
+    return merged_and_written;
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/tuning_kernel_tuner.cpp rocfft-5.7.1/library/src/tuning_kernel_tuner.cpp
--- rocfft-5.5.0/library/src/tuning_kernel_tuner.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/tuning_kernel_tuner.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,671 @@
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "tuning_kernel_tuner.h"
+#include "../../shared/arithmetic.h"
+#include "../../shared/environment.h"
+#include "function_pool.h"
+#include "logging.h"
+#include "rocfft.h"
+#include "solution_map.h"
+#include "tuning_helper.h"
+#include "twiddles.h"
+
+#include <iterator>
+#include <random>
+#include <set>
+
+static const char* candidates_folder = "TuningCandidates";
+// radices that Factorize() tries when it splits a length into factors
+static const std::vector<size_t> supported_factors = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 16, 17};
+
+// TODO- support half precision (only the two element sizes below are handled)
+static const size_t LDS_BYTE_LIMIT   = 32 * 1024; // LDS byte budget used to bound transforms_per_block
+static const size_t BYTES_PER_FLOAT2 = 8; // element size used when is_single is true
+static const size_t BYTES_PER_FLOAT4 = 16; // element size used when is_single is false
+
+// Largest transforms_per_block (tpb) whose batches fit into LDS_BYTE_LIMIT and whose
+// workgroup size tpt * tpb does not exceed wgs_bound.
+// tpt: threads_per_transform; wgs_bound: upper bound of workgroup_size
+// use_ltwd_3steps: when set and ltwd_base < 8, the ltwd table takes some lds as well
+size_t DeriveMaxTPB(size_t length,
+                    bool   is_single,
+                    bool   half_lds,
+                    bool   use_ltwd_3steps,
+                    size_t large1D,
+                    size_t tpt,
+                    size_t wgs_bound)
+{
+    const size_t elem_bytes = is_single ? BYTES_PER_FLOAT2 : BYTES_PER_FLOAT4;
+
+    // LDS footprint of one transform; half_lds cuts it in half
+    size_t batch_bytes = length * elem_bytes;
+    if(half_lds)
+        batch_bytes /= 2;
+
+    if(use_ltwd_3steps)
+    {
+        size_t ltwd_base, ltwd_steps;
+        get_large_twd_base_steps(large1D, use_ltwd_3steps, ltwd_base, ltwd_steps);
+        // only in this condition the ltwd table is put in lds; #elem = (1 << base) * 3
+        if(ltwd_base < 8)
+            batch_bytes += ((1 << ltwd_base) * 3) * elem_bytes;
+    }
+
+    // start from what the LDS budget allows, then clamp so that tpt * tpb <= wgs_bound
+    // (closed form of lowering tpb one step at a time; nothing to clamp when tpt is 0)
+    size_t tpb = LDS_BYTE_LIMIT / batch_bytes;
+    if(tpt != 0 && tpb > wgs_bound / tpt)
+        tpb = wgs_bound / tpt;
+
+    // note: the resulting workgroup size (tpt * tpb) is not handed back to the caller
+    return tpb;
+}
+
+size_t ConservativeMaxTPB(size_t length, bool is_single)
+{
+    // [reduce search space]:
+    //  theoretically, half_lds doubles the tpb that fits in LDS_BYTE_LIMIT.
+    //  But from observation, in the EDGE case where the non-half-lds tpb EXACTLY fits
+    //  LDS_BYTE_LIMIT (occu = 2), doubling the tpb through half_lds very likely takes
+    //  the occu down to 1.
+    //  So the conservative way is to keep the non-half-lds tpb as the upper bound:
+    //  any configuration with TPB > bound can be discarded, even for half_lds.
+
+    // is_single selects BYTES_PER_FLOAT2 as the element size, otherwise BYTES_PER_FLOAT4
+    const size_t elem_bytes  = is_single ? BYTES_PER_FLOAT2 : BYTES_PER_FLOAT4;
+    const size_t batch_bytes = length * elem_bytes;
+
+    // number of whole transforms whose data fits in LDS_BYTE_LIMIT
+    const size_t lds_bound = LDS_BYTE_LIMIT / batch_bytes;
+
+    // lengths of 1024 and above get one extra transform per block on top of that
+    return (length >= 1024) ? lds_bound + 1 : lds_bound;
+}
+
+// Enumerate every distinct way of writing `length` as a product of entries of
+// supported_factors. Each factorization is stored as an ascending vector, so that
+// reorderings of the same factors collapse into a single element of the set.
+std::set<std::vector<size_t>> Factorize(size_t length)
+{
+    std::set<std::vector<size_t>> result;
+    for(auto factor : supported_factors)
+    {
+        // only factors that divide the length evenly are of interest
+        if(length % factor != 0)
+            continue;
+        const size_t quotient = length / factor;
+        if(quotient == 1)
+        {
+            // nothing is left to split: the factor alone is a factorization
+            result.insert(std::vector<size_t>{factor});
+            continue;
+        }
+
+        // otherwise put the factor in front of every factorization of the quotient
+        for(auto& tail : Factorize(quotient))
+        {
+            std::vector<size_t> factors{factor};
+            factors.insert(factors.end(), tail.begin(), tail.end());
+            std::sort(factors.begin(), factors.end());
+            result.insert(factors);
+        }
+    }
+    return result;
+}
+
+size_t GetMaxRadicesSize(const std::set<std::vector<size_t>>& all_factors_set)
+{
+    // length of the shortest factorization; TWIDDLES_MAX_RADICES + 1 when the set is empty
+    size_t shortest = TWIDDLES_MAX_RADICES + 1;
+    for(const auto& factors : all_factors_set)
+    {
+        if(factors.size() < shortest)
+            shortest = factors.size();
+    }
+
+    // don't try kernels with too many radices: allow two more than the shortest one
+    return shortest + 2;
+}
+
+// recursively return power set of a range of ints (an empty range yields only the empty subset)
+std::set<std::vector<size_t>> PowerSet(std::vector<size_t>::const_iterator begin,
+                                       std::vector<size_t>::const_iterator end)
+{
+    std::set<std::vector<size_t>> ret;
+
+    // base case: the only subset of an empty range is the empty set. Testing for the empty
+    // range (instead of a single element) keeps an empty input from recursing past `end`.
+    if(begin == end)
+    {
+        ret.insert(std::vector<size_t>{});
+        return ret;
+    }
+
+    // recurse into the remainder; each of its subsets is kept once without and
+    // once with the front element (which is appended at the back of the subset)
+    for(auto subset : PowerSet(begin + 1, end))
+    {
+        ret.insert(subset);
+        subset.push_back(*begin);
+        ret.insert(subset);
+    }
+    return ret;
+}
+
+std::set<size_t> SupportedThreadsPerTransform(const std::vector<size_t>& factorization)
+{
+    std::set<size_t> tpts;
+    // every non-empty subset of the factors gives one candidate: the product of its members
+    for(const auto& subset : PowerSet(factorization.begin(), factorization.end()))
+    {
+        // seed with size_t{1}: accumulate computes in the seed's type, so an int seed narrows
+        if(!subset.empty())
+            tpts.insert(std::accumulate(
+                subset.begin(), subset.end(), size_t{1}, std::multiplies<size_t>()));
+    }
+    return tpts;
+}
+
+std::vector<double>
+    GetUtilizationRate(size_t length, const std::vector<size_t>& factors, size_t tpt)
+{
+    // layout of the result: [height_1, height_2, ..., height_n, average of the heights]
+    std::vector<double> rates;
+
+    // running total of the heights, divided by the number of factors at the end
+    double height_sum = 0;
+    for(size_t i = 0; i < factors.size(); ++i)
+    {
+        // height for this width (factor): length / width / tpt
+        rates.push_back(static_cast<double>(length) / factors[i] / tpt);
+        height_sum += rates.back();
+    }
+    rates.push_back(height_sum / factors.size());
+    return rates;
+}
+
+void PrintRejectionMsg(const std::string& msg, bool print)
+{
+    if(!LOG_TUNING_ENABLED() || !print) return; // needs both the tuning log and the caller's flag
+    (*LogSingleton::GetInstance().GetTuningOS()) << msg;
+}
+
+std::string FactorsToString(const std::vector<size_t>& factors)
+{
+    // renders the factors as a bracketed, comma-separated list, e.g. "[2, 3, 5]"
+    std::string text = "[";
+    for(size_t i = 0; i < factors.size(); ++i)
+    {
+        // every element but the first is preceded by the separator
+        if(i != 0)
+            text += ", ";
+        text += std::to_string(factors[i]);
+    }
+    text += "]";
+    return text;
+}
+
+// [reduce search space]
+// Phase 1 only reorders the factorizations recorded in target_factors[node_id]:
+//   - a factorization with at most 6 orderings besides the ascending one contributes
+//     all of those orderings (the ascending one itself is not added again);
+//   - one with more contributes only the cyclic shifts of the ascending order and of
+//     its reverse.
+// node_id: index of the kernel node into the packet's target_factors
+// all_umpermuted_factors: candidate factorizations, expected ascending as Factorize() builds them
+// return: the set of reordered factorizations to try
+std::set<std::vector<size_t>>
+    GetTotalFactorizationsForPhase1(size_t                         node_id,
+                                    std::set<std::vector<size_t>>& all_umpermuted_factors)
+{
+    std::set<std::vector<size_t>> ret;
+
+    auto& target_factors = TuningBenchmarker::GetSingleton().GetPacket()->target_factors;
+    assert(!target_factors.empty());
+
+    for(const auto& good_factors : all_umpermuted_factors)
+    {
+        // this is not our target factorization in phase 1, ignore it
+        if(target_factors[node_id].count(FactorsToString(good_factors)) == 0)
+            continue;
+
+        // peek at the permutation results first: if there are too many of them,
+        // the candidates are cut down below.
+        std::vector<std::vector<size_t>> permutations;
+
+        // next_permutation returns false once it wraps around to the ascending order,
+        // so for an ascending input this collects every other distinct ordering
+        std::vector<size_t> permuting_factors = good_factors;
+        while(std::next_permutation(permuting_factors.begin(), permuting_factors.end()))
+        {
+            permutations.push_back(permuting_factors);
+        }
+
+        if(permutations.size() > 6)
+        {
+            // too many permutations: use shifting instead
+            permutations.clear();
+
+            const size_t        factor_len = good_factors.size();
+            std::vector<size_t> reversed(good_factors.rbegin(),
+                                         good_factors.rend()); // [...,c,b,a]
+
+            // Double both sequences so that every cyclic shift is a contiguous window:
+            // [a,b,c,...] -> [a,b,c,...,a,b,c,...] and [...,c,b,a] -> [...,c,b,a,...,c,b,a].
+            // Each doubled vector is filled from a separate source vector, because
+            // vector::insert must not be handed iterators into the vector being modified.
+            std::vector<size_t> doubled_ori(good_factors);
+            doubled_ori.insert(doubled_ori.end(), good_factors.begin(), good_factors.end());
+            std::vector<size_t> doubled_rev(reversed);
+            doubled_rev.insert(doubled_rev.end(), reversed.begin(), reversed.end());
+
+            for(size_t i = 0; i < factor_len; ++i)
+            {
+                // window of factor_len elements starting at offset i
+                std::vector<size_t> shifted_ori(doubled_ori.begin() + i,
+                                                doubled_ori.begin() + i + factor_len);
+                std::vector<size_t> shifted_rev(doubled_rev.begin() + i,
+                                                doubled_rev.begin() + i + factor_len);
+                permutations.push_back(shifted_ori);
+                permutations.push_back(shifted_rev);
+            }
+        }
+
+        // the set drops orderings that were produced more than once
+        std::copy(permutations.begin(), permutations.end(), std::inserter(ret, ret.end()));
+    }
+
+    return ret;
+}
+
+std::set<KernelConfig> SupportedKernelConfigs(size_t length,
+                                              size_t node_id,
+                                              bool   is_single,
+                                              bool   is_sbcc,
+                                              bool   is_sbrc,
+                                              bool   is_sbcr,
+                                              size_t large1D)
+{
+    std::set<KernelConfig> configs; // result: the candidate configs that survive all pruning below
+
+    auto   factorizations   = Factorize(length);
+    size_t max_radices_size = GetMaxRadicesSize(factorizations);
+    bool   has_ltwd_mul     = is_sbcc && (large1D > 0);
+    // so far our kernel-gen implements intrinsic mode only on these two types
+    bool   can_do_intrinsic = is_sbcc || is_sbcr;
+    size_t conservative_tpb = ConservativeMaxTPB(length, is_single);
+
+    // [reduce search space]:
+    // we will remove those configs(A) with tpt = length,
+    // and remove other configs having the same tpb as configs(A)
+    std::set<size_t> tpbs_to_remove;
+    std::set<size_t> all_tpts;
+
+    bool        print_reject = !rocfft_getenv("PRINT_REJECT_REASON").empty();
+    std::string min_wgs_str  = rocfft_getenv("MIN_WGS");
+    std::string max_wgs_str  = rocfft_getenv("MAX_WGS");
+    size_t      min_wgs      = min_wgs_str.empty() ? 64 : std::atoi(min_wgs_str.c_str());
+    size_t      max_wgs      = max_wgs_str.empty() ? 512 : std::atoi(max_wgs_str.c_str());
+
+    // if min_wgs is greater than length, then we lower it; both bounds are then rounded down to a multiple of 64
+    min_wgs = (length < min_wgs) ? length : min_wgs;
+    min_wgs = (min_wgs % 64 == 0) ? min_wgs : std::max((size_t)0, min_wgs - (min_wgs % 64));
+    max_wgs = (max_wgs % 64 == 0) ? max_wgs : max_wgs - (max_wgs % 64);
+
+    auto& target_factors = TuningBenchmarker::GetSingleton().GetPacket()->target_factors;
+    bool  is_phase0      = (target_factors.empty());
+    bool  no_permutation = is_phase0;
+
+    // avoid 336 from expanding to 5 factors
+    if(length == 336)
+        --max_radices_size;
+
+    if(is_phase0 == false)
+    {
+        // phase 1 gets its orderings from GetTotalFactorizationsForPhase1(); no_permutation is thus true in both phases
+        no_permutation = true;
+        factorizations = GetTotalFactorizationsForPhase1(node_id, factorizations);
+    }
+
+    for(auto factorization : factorizations)
+    {
+        // [reduce search space]:
+        // don't try kernels with too many radices
+        if(is_phase0 && (factorization.size() > max_radices_size))
+        {
+            PrintRejectionMsg("reject: using too many radices in factors\n", print_reject);
+            continue;
+        }
+
+        auto tpts = SupportedThreadsPerTransform(factorization);
+
+        // [reduce search space]
+        // by utilization rate
+        for(auto tpt = tpts.begin(), last = tpts.end(); tpt != last;)
+        {
+            std::vector<double> util_rates = GetUtilizationRate(length, factorization, *tpt);
+
+            auto max_rates = std::max_element(util_rates.begin(), util_rates.end());
+            auto avg_rate  = util_rates.back();
+            // if the average rate is < 1.0, or any entry (a height or the average itself) is > 8.0, then it's bad
+            if(avg_rate < 1.0 || *max_rates > 8.0)
+            {
+                if(avg_rate < 1.0)
+                    PrintRejectionMsg(
+                        "reject: small util_avg_rate (< 1): " + std::to_string(avg_rate) + "\n",
+                        print_reject);
+                else
+                    PrintRejectionMsg(
+                        "reject: existng an util_rate (> 8): " + std::to_string(*max_rates) + "\n",
+                        print_reject);
+                tpt = tpts.erase(tpt);
+            }
+            else
+                ++tpt;
+        }
+
+        // visit this ordering of the factors; further permutations are walked only when no_permutation is false
+        do
+        {
+            for(size_t wgs = min_wgs; wgs <= max_wgs; wgs += 64)
+            {
+                for(const auto tpt : tpts)
+                {
+                    if(tpt < wgs)
+                    {
+                        for(bool half_lds : {true, false})
+                        {
+                            if(half_lds && (is_sbcr || is_sbrc))
+                            {
+                                PrintRejectionMsg("reject: only sbrr and sbcc support half-lds\n",
+                                                  print_reject);
+                                continue;
+                            }
+
+                            for(bool use_ltwd_3steps : {true, false})
+                            {
+                                // skip ltwd_3steps if not needed
+                                if(!has_ltwd_mul && use_ltwd_3steps)
+                                    continue;
+
+                                // Get the largest trans_per_block for this tpt under the current wgs step
+                                size_t max_tpb = DeriveMaxTPB(length,
+                                                              is_single,
+                                                              half_lds,
+                                                              use_ltwd_3steps,
+                                                              large1D,
+                                                              tpt,
+                                                              wgs);
+
+                                // this tpt and tpb will be rejected (see the pruning after the loops)
+                                if(tpt == length)
+                                    tpbs_to_remove.insert(max_tpb);
+
+                                // [reduce search space]: try max_tpb, plus max_tpb + 1 when max_tpb does not hit wgs exactly
+                                size_t num_tpb_try = (tpt * max_tpb == wgs) ? 1 : 2;
+                                for(size_t t = 0; t < num_tpb_try; ++t)
+                                {
+                                    size_t tpb       = max_tpb + t;
+                                    size_t final_wgs = tpt * tpb;
+                                    if(final_wgs > max_wgs)
+                                        continue;
+                                    if(final_wgs <= (wgs - 64))
+                                        continue;
+
+                                    // [reduce search space]
+                                    if(tpb > conservative_tpb)
+                                    {
+                                        PrintRejectionMsg("reject: tpb > conservation max tpb\n",
+                                                          print_reject);
+                                        continue;
+                                    }
+
+                                    // [reduce search space]:
+                                    // prune some bad configurations
+                                    if(length >= 64 && final_wgs < 64)
+                                    {
+                                        PrintRejectionMsg("reject: (length >= 64) and (wgs < 64)\n",
+                                                          print_reject);
+                                        continue;
+                                    }
+                                    if(IsPo2(length) && (length % final_wgs != 0))
+                                    {
+                                        PrintRejectionMsg("reject: require wgs divisable to length "
+                                                          "for Pow2 length\n",
+                                                          print_reject);
+                                        continue;
+                                    }
+
+                                    // [reduce search space]
+                                    // from current benchmark result, dir-reg mode always ranks high
+                                    for(bool direct_to_from_reg : {true /*, false*/})
+                                    {
+                                        // half lds currently requires direct to/from reg
+                                        if(half_lds && !direct_to_from_reg)
+                                        {
+                                            PrintRejectionMsg(
+                                                "reject: half_lds requires direct to/from reg\n",
+                                                print_reject);
+                                            continue;
+                                        }
+
+                                        for(bool intrinsic : {true, false})
+                                        {
+                                            // intrinsic currently requires direct to/from reg
+                                            if(intrinsic && !direct_to_from_reg)
+                                            {
+                                                PrintRejectionMsg("reject: intrinsic requires "
+                                                                  "direct to/from reg\n",
+                                                                  print_reject);
+                                                continue;
+                                            }
+
+                                            // intrinsic currently is supported on sbcc/sbcr
+                                            if(intrinsic && !can_do_intrinsic)
+                                            {
+                                                PrintRejectionMsg("reject: intrinsic mode is "
+                                                                  "supported only on sbcc/sbcr\n",
+                                                                  print_reject);
+                                                continue;
+                                            }
+
+                                            all_tpts.insert(tpt);
+
+                                            KernelConfig config;
+                                            config.direct_to_from_reg    = direct_to_from_reg;
+                                            config.intrinsic_buffer_inst = intrinsic;
+                                            config.half_lds              = half_lds;
+                                            config.threads_per_transform = {(int)tpt, 0};
+                                            config.transforms_per_block  = tpb;
+                                            config.workgroup_size        = final_wgs;
+                                            config.use_3steps_large_twd  = use_ltwd_3steps;
+                                            config.factors               = factorization;
+
+                                            configs.insert(config);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        } while(no_permutation == false
+                && std::next_permutation(factorization.begin(), factorization.end()));
+    }
+
+    // [reduce search space]
+    // if we have other options than tpt == length,
+    // then we can remove all configs with tpt==length (and those sharing a tpb recorded for them)
+    if(all_tpts.size() >= 2 && tpbs_to_remove.size() > 0)
+    {
+        for(auto config = configs.begin(), last = configs.end(); config != last;)
+        {
+            // remove bad tpt
+            if(config->threads_per_transform[0] == (int)length)
+            {
+                PrintRejectionMsg("reject: tpt == length\n" + config->Print() + "\n\n",
+                                  print_reject);
+                config = configs.erase(config);
+            }
+            // remove bad tpbs
+            else if(tpbs_to_remove.count((size_t)config->transforms_per_block) > 0)
+            {
+                PrintRejectionMsg("reject: tpb is considered bad\n" + config->Print() + "\n\n",
+                                  print_reject);
+                config = configs.erase(config);
+            }
+            else
+                ++config;
+        }
+        all_tpts.erase(length);
+    }
+
+    // [reduce search space]
+    // we can remove the largest tpts ((size - 1) / 2 of them), since they are always in low perf.
+    if(all_tpts.size() > 0)
+    {
+        size_t num_tpts_to_remove = (all_tpts.size() - 1) / 2;
+        if(num_tpts_to_remove > 0)
+        {
+            std::set<size_t>    tpts_to_remove;
+            std::vector<size_t> tpts_vec(all_tpts.begin(), all_tpts.end());
+            std::sort(tpts_vec.begin(), tpts_vec.end());
+            // the largest #-num_tpts_to_remove tpts will be removed
+            for(size_t i = 0; i < num_tpts_to_remove; ++i)
+            {
+                tpts_to_remove.insert(tpts_vec.back());
+                tpts_vec.pop_back();
+            }
+
+            for(auto config = configs.begin(), last = configs.end(); config != last;)
+            {
+                // remove bad tpt
+                if(tpts_to_remove.count((size_t)(config->threads_per_transform[0])) > 0)
+                {
+                    PrintRejectionMsg("reject: tpt is considered bad\n" + config->Print() + "\n\n",
+                                      print_reject);
+                    config = configs.erase(config);
+                }
+                else
+                    ++config;
+            }
+        }
+    }
+
+    return configs;
+}
+
+void EnumerateKernelConfigs(const ExecPlan& execPlan) // registers candidate kernel configs for every node in execPlan.execSeq
+{
+    auto        tuningPacket = TuningBenchmarker::GetSingleton().GetPacket();
+    std::string archName     = get_arch_name(execPlan.deviceProp);
+
+    tuningPacket->tuning_arch_name = archName;
+    tuningPacket->numCUs           = execPlan.deviceProp.multiProcessorCount;
+    tuningPacket->total_nodes      = execPlan.execSeq.size();
+    tuningPacket->total_candidates.resize(tuningPacket->total_nodes);
+    tuningPacket->tuning_kernel_tokens.resize(tuningPacket->total_nodes);
+    tuningPacket->is_builtin_kernel.resize(tuningPacket->total_nodes);
+
+    // enumerate the candidate kernel_configs for each node of the execution sequence
+    std::string kernel_token;
+    for(size_t node_id = 0; node_id < execPlan.execSeq.size(); node_id++)
+    {
+        // TODO- 2D tuning
+        size_t bench_ssn = 0; // number of candidates registered for this node
+        bool   check_dup = false;
+        size_t len       = execPlan.execSeq[node_id]->length[0];
+        bool   is_single = (execPlan.rootPlan->precision == rocfft_precision_single);
+        bool   is_sbcc   = (execPlan.execSeq[node_id]->scheme == CS_KERNEL_STOCKHAM_BLOCK_CC);
+        bool   is_sbrc   = (execPlan.execSeq[node_id]->scheme == CS_KERNEL_STOCKHAM_BLOCK_RC);
+        bool   is_sbcr   = (execPlan.execSeq[node_id]->scheme == CS_KERNEL_STOCKHAM_BLOCK_CR);
+        size_t large1D   = execPlan.execSeq[node_id]->large1D;
+        auto   base_key  = execPlan.execSeq[node_id]->GetKernelKey();
+
+        // if this kernel is an internal built-in one (transpose, etc.), we are not tuning it yet,
+        // but we plan to tune it in the future.
+        if(base_key == EmptyFMKey)
+        {
+            check_dup = true;
+            ProblemKey built_in_kernel_key(archName, solution_map::KERNEL_TOKEN_BUILTIN_KERNEL);
+            TuningBenchmarker::GetSingleton().GetBindingSolutionMap()->add_solution(
+                built_in_kernel_key, EmptyFMKey, check_dup);
+
+            tuningPacket->tuning_kernel_tokens[node_id] = solution_map::KERNEL_TOKEN_BUILTIN_KERNEL;
+            tuningPacket->is_builtin_kernel[node_id]    = true;
+            tuningPacket->total_candidates[node_id]     = bench_ssn; // still 0 here: no candidates for built-in kernels
+            continue;
+        }
+
+        // In init stage, save the default kernel token (without extra info)
+        GetKernelToken(base_key, kernel_token);
+        tuningPacket->tuning_kernel_tokens[node_id] = kernel_token;
+        tuningPacket->is_builtin_kernel[node_id]    = false;
+
+        // modify the token: append extra candidate info (leaf-node index and tuning phase)
+        kernel_token += "_leafnode_" + std::to_string(node_id);
+        kernel_token += "_phase_" + std::to_string(tuningPacket->tuning_phase);
+        ProblemKey probKey_kernel(archName, kernel_token);
+
+        // enumerate !
+        auto kernel_configs
+            = SupportedKernelConfigs(len, node_id, is_single, is_sbcc, is_sbrc, is_sbcr, large1D);
+        for(const auto& config : kernel_configs)
+        {
+            // NB:
+            //  add_solution will append a solution to the solution-vec under the probKey
+            FMKey alt_key = get_alternative_FMKey(base_key, config);
+
+            // NB:
+            //     A very important part for SBRC-Trans is that: !!!!
+            //     the SBRC-Trans-Type in the BaseKey of the default kernel is not always right
+            //     for all the configurations. Since TPB is changed, we should also update
+            //     the SBRC-Trans-Type according to config and node dimension.
+            SBRC_TRANSPOSE_TYPE sbrcType
+                = execPlan.execSeq[node_id]->sbrc_transpose_type(config.transforms_per_block);
+            std::get<3>(alt_key) = sbrcType;
+
+            // when init tuning: output bunches of candidate kernels,
+            // actually we need to check if there is a duplication, but in this case it'll never happen
+            // and the return option-id is irrelevant here. so check_dup=false, primary=true
+            TuningBenchmarker::GetSingleton().GetBindingSolutionMap()->add_solution(
+                probKey_kernel, alt_key, check_dup);
+            bench_ssn++;
+        }
+        tuningPacket->total_candidates[node_id] = bench_ssn;
+    }
+
+    if(tuningPacket->dump_candidates)
+    {
+        std::string filename         = tuningPacket->tuning_problem_name + "_tuning.dat";
+        std::string workspace_folder = "";
+        workspace_folder             = rocfft_getenv("TUNING_WORKSPACE");
+
+        fs::path dump_path(workspace_folder.c_str()); // <TUNING_WORKSPACE>/<candidates_folder>/<problem name>_tuning.dat
+        dump_path /= candidates_folder;
+        dump_path /= filename.c_str();
+
+        // dump the solution candidates to file (from primary_map), sort=false
+        TuningBenchmarker::GetSingleton().GetBindingSolutionMap()->write_solution_map_data(
+            dump_path, false);
+    }
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/tuning_plan_tuner.cpp rocfft-5.7.1/library/src/tuning_plan_tuner.cpp
--- rocfft-5.5.0/library/src/tuning_plan_tuner.cpp	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/library/src/tuning_plan_tuner.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,122 @@
+// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "tuning_plan_tuner.h"
+#include "solution_map.h"
+#include "tuning_helper.h"
+#include "tuning_kernel_tuner.h"
+
+#include <iterator>
+#include <random>
+#include <set>
+#include <unordered_set>
+
+// Some problems are not supported yet.
+static const std::set<ComputeScheme> supported_prob_schemes = {CS_KERNEL_STOCKHAM,
+                                                               CS_L1D_TRTRT,
+                                                               CS_L1D_CC,
+                                                               CS_L1D_CRT,
+                                                               CS_2D_RTRT,
+                                                               CS_2D_RC,
+                                                               CS_3D_TRTRTR,
+                                                               CS_3D_RTRT,
+                                                               CS_3D_BLOCK_RC,
+                                                               CS_3D_BLOCK_CR,
+                                                               CS_3D_RC};
+
+// Recursively adds `node` and its subtree to the solution map; returns the "option_id" of `node` in its sol-vector
+size_t SerializeTree(TreeNode* node, std::string& archName)
+{
+    std::vector<SolutionPtr> child_nodes;
+    std::string              min_token, full_token;
+
+    // if node is an internal node, then it has children: serialize each of them first
+    for(const auto& c : node->childNodes)
+    {
+        GetNodeToken(*(c.get()), min_token, full_token);
+
+        size_t child_option = SerializeTree(c.get(), archName);
+        child_nodes.push_back({min_token, child_option});
+    }
+    // if node is a leaf node without child-nodes, then the SOL_LEAF_NODE
+    // should have exactly one child node, which is SOL_KERNEL_ONLY
+    if(node->nodeType == NT_LEAF)
+    {
+        auto kernel_key = node->GetKernelKey();
+
+        if(kernel_key == EmptyFMKey)
+            min_token = solution_map::KERNEL_TOKEN_BUILTIN_KERNEL;
+        else
+            GetKernelToken(kernel_key, min_token);
+
+        // the option-id is irrelevant since we will modify it during tuning
+        child_nodes.push_back({min_token, 0});
+    }
+
+    GetNodeToken(*node, min_token, full_token); // min_token now refers to this node, not to a child
+    ProblemKey problemKey(archName, min_token);
+
+    // Add a solution to primary map (as candidates):
+    //   if SOL_INTERNAL_NODE --> children = decomposition
+    //   if SOL_LEAF_NODE     --> children = one kernel-node
+    // check_dup=false, primary_map=true
+    size_t my_option_id = TuningBenchmarker::GetSingleton().GetBindingSolutionMap()->add_solution(
+        problemKey, node, child_nodes, node->isRootNode(), false);
+
+    // save the problem name (the root node's min_token)
+    if(node->isRootNode())
+        TuningBenchmarker::GetSingleton().GetPacket()->tuning_problem_name = min_token;
+
+    return my_option_id;
+}
+
+void EnumerateTrees(ExecPlan& execPlan) // builds the plan tree, then registers tree and kernel candidates for tuning
+{
+    std::string archName = get_arch_name(execPlan.deviceProp);
+
+    // TODO- plan-tuning: build tree several times to generate different trees
+    {
+        execPlan.rootPlan->RecursiveBuildTree();
+
+        // Problem types that are not supported yet (real, bluestein...): return directly.
+        // The tuner knows to skip work by testing "packet->total_nodes == 0"
+        if(supported_prob_schemes.count(execPlan.rootPlan->scheme) == 0)
+            return;
+
+        assert(execPlan.rootPlan->length.size() == execPlan.rootPlan->dimension);
+        assert(execPlan.rootPlan->length.size() == execPlan.rootPlan->inStride.size());
+        assert(execPlan.rootPlan->length.size() == execPlan.rootPlan->outStride.size());
+
+        execPlan.rootPlan->CollectLeaves(execPlan.execSeq, execPlan.fuseShims);
+
+        // don't need to do SanityCheck and KernelCheck now, since they check whether
+        // the kernels exist in function_pool, which is not always true for RTC and tuning
+        // execPlan.rootPlan->SanityCheck(rootScheme, execPlan.solution_kernels);
+
+        if(TuningBenchmarker::GetSingleton().GetPacket()->tuning_phase == 0) // tree is serialized in phase 0 only
+        {
+            // Adding decomposition solutions from this tree-decomposition
+            SerializeTree(execPlan.rootPlan.get(), archName);
+        }
+
+        // Adding kernel candidates
+        EnumerateKernelConfigs(execPlan);
+    }
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/library/src/twiddles.cpp rocfft-5.7.1/library/src/twiddles.cpp
--- rocfft-5.5.0/library/src/twiddles.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/library/src/twiddles.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -1,5 +1,5 @@
 /******************************************************************************
-* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,9 +22,12 @@
 
 #include "twiddles.h"
 #include "../../shared/arithmetic.h"
-#include "device/kernels/twiddle_factors.h"
+#include "../../shared/hipstream_wrapper.h"
+#include "../../shared/rocfft_hip.h"
 #include "function_pool.h"
-#include "rocfft_hip.h"
+#include "rtc_cache.h"
+#include "rtc_kernel.h"
+#include "rtc_twiddle_kernel.h"
 #include <cassert>
 #include <math.h>
 #include <numeric>
@@ -32,39 +35,6 @@
 #include <string>
 #include <tuple>
 
-// RAII wrapper around hipStream_t
-struct hipStream_wrapper_t
-{
-    hipStream_wrapper_t()
-        : stream(nullptr)
-    {
-    }
-    void alloc()
-    {
-        if(stream == nullptr && hipStreamCreate(&stream) != hipSuccess)
-            throw std::runtime_error("hipStreamCreate failure");
-    }
-    operator hipStream_t()
-    {
-        return stream;
-    }
-    ~hipStream_wrapper_t()
-    {
-        if(stream)
-            (void)hipStreamDestroy(stream);
-    }
-    hipStream_wrapper_t(const hipStream_wrapper_t&) = delete;
-    hipStream_wrapper_t& operator=(const hipStream_wrapper_t&) = delete;
-    hipStream_wrapper_t(hipStream_wrapper_t&& other)
-        : stream(other.stream)
-    {
-        other.stream = nullptr;
-    }
-
-private:
-    hipStream_t stream;
-};
-
 // this vector stores streams for each device id.  index in the
 // vector is device id.  note that this vector needs to be protected
 // against concurrent access, but twiddles are always accessed
@@ -92,6 +62,9 @@
     // what we need.
     bool attach_halfN;
 
+    const rocfft_precision precision;
+    const std::string      gpu_arch;
+
     void GetKernelParams(const std::vector<size_t>& radices,
                          std::vector<size_t>&       radices_prod,
                          std::vector<size_t>&       radices_sum_prod,
@@ -154,45 +127,14 @@
             throw std::runtime_error("unable to allocate twiddle length "
                                      + std::to_string(total_length));
 
-        auto num_radices = radices.size();
-
-        auto blockSize  = TWIDDLES_THREADS;
-        auto numBlocksX = DivRoundingUp<size_t>(num_radices, blockSize);
-        auto numBlocksY = DivRoundingUp<size_t>(maxElem / minElem, blockSize);
-
         auto device_data_ptr = static_cast<T*>(output.data());
 
-        radices_t radices_device;
-        radices_t radices_prod_device;
-        radices_t radices_sum_prod_device;
-        std::copy(radices.begin(), radices.end(), radices_device.data);
-        std::copy(radices_prod.begin(), radices_prod.end(), radices_prod_device.data);
-        std::copy(radices_sum_prod.begin(), radices_sum_prod.end(), radices_sum_prod_device.data);
-
-        hipLaunchKernelGGL(GenerateTwiddleTableKernel<T>,
-                           dim3(numBlocksX, numBlocksY),
-                           dim3(blockSize, blockSize),
-                           0, // sharedMemBytes
-                           stream,
-                           length_limit,
-                           num_radices,
-                           radices_device,
-                           radices_prod_device,
-                           radices_sum_prod_device,
-                           device_data_ptr);
+        launch_radices_kernel(
+            radices, radices_prod, radices_sum_prod, maxElem, minElem, stream, device_data_ptr);
 
         if(attach_halfN)
         {
-            auto numBlocks_halfN = DivRoundingUp<size_t>(half_N, blockSize);
-
-            hipLaunchKernelGGL(GenerateHalfNTableKernel<T>,
-                               dim3(numBlocks_halfN),
-                               dim3(blockSize),
-                               0, // sharedMemBytes
-                               stream,
-                               half_N,
-                               N,
-                               device_data_ptr + table_sz);
+            launch_half_N_kernel(stream, device_data_ptr + table_sz);
         }
     }
 
@@ -214,35 +156,78 @@
 
         auto device_data_ptr = static_cast<T*>(output.data());
 
-        hipLaunchKernelGGL(GenerateTwiddleTableKernel<T>,
-                           dim3(numBlocks),
-                           dim3(blockSize),
-                           0, // sharedMemBytes
-                           stream,
-                           length_limit,
-                           N,
-                           device_data_ptr);
+        auto kernel = RTCKernelTwiddle::generate(gpu_arch, TwiddleTableType::LENGTH_N, precision);
+        RTCKernelArgs kargs;
+        kargs.append_size_t(length_limit);
+        kargs.append_size_t(N);
+        kargs.append_ptr(device_data_ptr);
+
+        kernel.launch(kargs, dim3(numBlocks), dim3(blockSize), 0, stream);
 
         if(attach_halfN)
         {
-            auto numBlocks_halfN = DivRoundingUp<size_t>(half_N, blockSize);
-
-            hipLaunchKernelGGL(GenerateHalfNTableKernel<T>,
-                               dim3(numBlocks_halfN),
-                               dim3(blockSize),
-                               0, // sharedMemBytes
-                               stream,
-                               half_N,
-                               N,
-                               device_data_ptr + length);
+            launch_half_N_kernel(stream, device_data_ptr + length);
         }
     }
 
+    void launch_radices_kernel(const std::vector<size_t>& radices,
+                               std::vector<size_t>&       radices_prod,
+                               std::vector<size_t>&       radices_sum_prod,
+                               size_t                     maxElem,
+                               size_t                     minElem,
+                               hipStream_t&               stream,
+                               T*                         output)
+    { // launches the RTC-generated RADICES twiddle kernel on `stream`, passing `output` as its pointer argument
+        auto num_radices = radices.size();
+
+        auto blockSize  = TWIDDLES_THREADS;
+        auto numBlocksX = DivRoundingUp<size_t>(num_radices, blockSize); // grid x is sized from the number of radices
+        auto numBlocksY = DivRoundingUp<size_t>(maxElem / minElem, blockSize); // grid y is sized from maxElem / minElem
+
+        radices_t radices_device; // radices_t copies of the host vectors, handed to the kernel via append_struct
+        radices_t radices_prod_device;
+        radices_t radices_sum_prod_device;
+        std::copy(radices.begin(), radices.end(), radices_device.data);
+        std::copy(radices_prod.begin(), radices_prod.end(), radices_prod_device.data);
+        std::copy(radices_sum_prod.begin(), radices_sum_prod.end(), radices_sum_prod_device.data);
+
+        auto kernel = RTCKernelTwiddle::generate(gpu_arch, TwiddleTableType::RADICES, precision);
+        RTCKernelArgs kargs;
+        kargs.append_size_t(length_limit); // member value; TwiddleTable2D overwrites it before each call
+        kargs.append_size_t(num_radices);
+        kargs.append_struct(radices_device);
+        kargs.append_struct(radices_prod_device);
+        kargs.append_struct(radices_sum_prod_device);
+        kargs.append_ptr(output);
+        kernel.launch(kargs, dim3(numBlocksX, numBlocksY), dim3(blockSize, blockSize), 0, stream);
+    }
+
+    void launch_half_N_kernel(hipStream_t& stream, T* output) // launches the RTC-generated HALF_N twiddle kernel on `stream`
+    {
+        auto blockSize = TWIDDLES_THREADS;
+
+        auto kernel = RTCKernelTwiddle::generate(gpu_arch, TwiddleTableType::HALF_N, precision);
+        RTCKernelArgs kargs;
+        kargs.append_size_t(half_N); // half_N is (N + 1) / 2 when attach_halfN is set, else 0 (see constructor)
+        kargs.append_size_t(N);
+        kargs.append_ptr(output);
+
+        auto numBlocks_halfN = DivRoundingUp<size_t>(half_N, blockSize); // 1-D grid sized from half_N
+
+        kernel.launch(kargs, dim3(numBlocks_halfN), dim3(blockSize), 0, stream);
+    }
+
 public:
-    TwiddleTable(size_t _N, size_t _length_limit, bool _attach_halfN)
+    TwiddleTable(rocfft_precision   precision,
+                 const std::string& gpu_arch,
+                 size_t             _N,
+                 size_t             _length_limit,
+                 bool               _attach_halfN)
         : N(_N)
         , length_limit(_length_limit ? _length_limit : _N)
         , attach_halfN(_attach_halfN)
+        , precision(precision)
+        , gpu_arch(gpu_arch)
     {
         half_N = attach_halfN ? (N + 1) / 2 : 0;
     }
@@ -262,8 +247,8 @@
     size_t N2;
 
 public:
-    TwiddleTable2D(size_t _N1, size_t _N2)
-        : TwiddleTable<T>(0, 0, false)
+    TwiddleTable2D(rocfft_precision precision, const std::string& gpu_arch, size_t _N1, size_t _N2)
+        : TwiddleTable<T>(precision, gpu_arch, 0, 0, false)
         , N1(_N1)
         , N2(_N2)
     {
@@ -277,8 +262,6 @@
         if(radices1 == radices2)
             N2 = 0;
 
-        auto blockSize = TWIDDLES_THREADS;
-
         size_t              table_sz_1, maxElem_1, minElem_1;
         std::vector<size_t> radices_prod_1, radices_sum_prod_1;
 
@@ -304,60 +287,25 @@
             throw std::runtime_error("unable to allocate twiddle length "
                                      + std::to_string(table_sz));
 
-        auto device_data_ptr = static_cast<T*>(output.data());
-
-        auto num_radices_1 = radices1.size();
-
-        auto numBlocksX_1 = DivRoundingUp<size_t>(num_radices_1, blockSize);
-        auto numBlocksY_1 = DivRoundingUp<size_t>(maxElem_1 / minElem_1, blockSize);
-
-        radices_t radices1_device;
-        radices_t radices_prod_device_1;
-        radices_t radices_sum_prod_device_1;
-        std::copy(radices1.begin(), radices1.end(), radices1_device.data);
-        std::copy(radices_prod_1.begin(), radices_prod_1.end(), radices_prod_device_1.data);
-        std::copy(
-            radices_sum_prod_1.begin(), radices_sum_prod_1.end(), radices_sum_prod_device_1.data);
-
-        hipLaunchKernelGGL(GenerateTwiddleTableKernel<T>,
-                           dim3(numBlocksX_1, numBlocksY_1),
-                           dim3(blockSize, blockSize),
-                           0, // sharedMemBytes
-                           stream,
-                           N1,
-                           num_radices_1,
-                           radices1_device,
-                           radices_prod_device_1,
-                           radices_sum_prod_device_1,
-                           device_data_ptr);
-
+        auto device_data_ptr          = static_cast<T*>(output.data());
+        TwiddleTable<T>::length_limit = N1;
+        TwiddleTable<T>::launch_radices_kernel(radices1,
+                                               radices_prod_1,
+                                               radices_sum_prod_1,
+                                               maxElem_1,
+                                               minElem_1,
+                                               stream,
+                                               device_data_ptr);
         if(N2)
         {
-            auto num_radices_2 = radices2.size();
-
-            auto numBlocksX_2 = DivRoundingUp<size_t>(num_radices_2, blockSize);
-            auto numBlocksY_2 = DivRoundingUp<size_t>(maxElem_2 / minElem_2, blockSize);
-
-            radices_t radices2_device;
-            radices_t radices_prod_device_2;
-            radices_t radices_sum_prod_device_2;
-            std::copy(radices2.begin(), radices2.end(), radices2_device.data);
-            std::copy(radices_prod_2.begin(), radices_prod_2.end(), radices_prod_device_2.data);
-            std::copy(radices_sum_prod_2.begin(),
-                      radices_sum_prod_2.end(),
-                      radices_sum_prod_device_2.data);
-
-            hipLaunchKernelGGL(GenerateTwiddleTableKernel<T>,
-                               dim3(numBlocksX_2, numBlocksY_2),
-                               dim3(blockSize, blockSize),
-                               0, // sharedMemBytes
-                               stream,
-                               N2,
-                               num_radices_2,
-                               radices2_device,
-                               radices_prod_device_2,
-                               radices_sum_prod_device_2,
-                               device_data_ptr + table_sz_1);
+            TwiddleTable<T>::length_limit = N2;
+            TwiddleTable<T>::launch_radices_kernel(radices2,
+                                                   radices_prod_2,
+                                                   radices_sum_prod_2,
+                                                   maxElem_2,
+                                                   minElem_2,
+                                                   stream,
+                                                   device_data_ptr + table_sz_1);
         }
     }
 };
@@ -373,10 +321,18 @@
     size_t X, Y;
     size_t tableSize;
 
+    const rocfft_precision precision;
+    const std::string      gpu_arch;
+
 public:
-    TwiddleTableLarge(size_t length, size_t base = LTWD_BASE_DEFAULT)
+    TwiddleTableLarge(rocfft_precision   precision,
+                      const std::string& gpu_arch,
+                      size_t             length,
+                      size_t             base = LTWD_BASE_DEFAULT)
         : N(length)
         , largeTwdBase(base)
+        , precision(precision)
+        , gpu_arch(gpu_arch)
     {
         X         = static_cast<size_t>(1) << largeTwdBase; // ex: 2^8 = 256
         Y         = DivRoundingUp<size_t>(CeilPo2(N), largeTwdBase);
@@ -401,22 +357,23 @@
         auto numBlocksX = DivRoundingUp<size_t>(X, blockSize);
         auto numBlocksY = DivRoundingUp<size_t>(Y, blockSize);
 
-        hipLaunchKernelGGL(GenerateTwiddleTableLargeKernel<T>,
-                           dim3(numBlocksX, numBlocksY),
-                           dim3(blockSize, blockSize),
-                           0, // sharedMemBytes
-                           stream,
-                           phi,
-                           largeTwdBase,
-                           X,
-                           Y,
-                           static_cast<T*>(output.data()));
+        auto kernel = RTCKernelTwiddle::generate(gpu_arch, TwiddleTableType::LARGE, precision);
+        RTCKernelArgs kargs;
+        kargs.append_double(phi);
+        kargs.append_size_t(largeTwdBase);
+        kargs.append_size_t(X);
+        kargs.append_size_t(Y);
+        kargs.append_ptr(output.data());
+
+        kernel.launch(kargs, dim3(numBlocksX, numBlocksY), dim3(blockSize, blockSize), 0, stream);
     }
 };
 
 template <typename T>
 gpubuf twiddles_create_pr(size_t                     N,
                           size_t                     length_limit,
+                          rocfft_precision           precision,
+                          const char*                gpu_arch,
                           size_t                     largeTwdBase,
                           bool                       attach_halfN,
                           const std::vector<size_t>& radices,
@@ -440,7 +397,7 @@
 
     if((N <= LARGE_TWIDDLE_THRESHOLD) && largeTwdBase == 0)
     {
-        TwiddleTable<T> twTable(N, length_limit, attach_halfN);
+        TwiddleTable<T> twTable(precision, gpu_arch, N, length_limit, attach_halfN);
         twTable.GenerateTwiddleTable(radices, stream, twts);
     }
     else
@@ -449,12 +406,13 @@
 
         if(largeTwdBase == 0)
         {
-            TwiddleTable<T> twTable(N, length_limit, attach_halfN);
+            TwiddleTable<T> twTable(precision, gpu_arch, N, length_limit, attach_halfN);
             twTable.GenerateTwiddleTable(radices, stream, twts);
         }
         else
         {
-            TwiddleTableLarge<T> twTable(N, largeTwdBase); // does not generate radices
+            TwiddleTableLarge<T> twTable(
+                precision, gpu_arch, N, largeTwdBase); // does not generate radices
             twTable.GenerateTwiddleTable(stream, twts);
         }
     }
@@ -468,27 +426,29 @@
 gpubuf twiddles_create(size_t                     N,
                        size_t                     length_limit,
                        rocfft_precision           precision,
+                       const char*                gpu_arch,
                        size_t                     largeTwdBase,
                        bool                       attach_halfN,
                        const std::vector<size_t>& radices,
                        unsigned int               deviceId)
 {
-    if(precision == rocfft_precision_single)
-        return twiddles_create_pr<float2>(
-            N, length_limit, largeTwdBase, attach_halfN, radices, deviceId);
-    else if(precision == rocfft_precision_double)
-        return twiddles_create_pr<double2>(
-            N, length_limit, largeTwdBase, attach_halfN, radices, deviceId);
-    else
+    switch(precision)
     {
-        assert(false);
-        return {};
+    case rocfft_precision_single:
+        return twiddles_create_pr<rocfft_complex<float>>(
+            N, length_limit, precision, gpu_arch, largeTwdBase, attach_halfN, radices, deviceId);
+    case rocfft_precision_double:
+        return twiddles_create_pr<rocfft_complex<double>>(
+            N, length_limit, precision, gpu_arch, largeTwdBase, attach_halfN, radices, deviceId);
+    case rocfft_precision_half:
+        return twiddles_create_pr<rocfft_complex<_Float16>>(
+            N, length_limit, precision, gpu_arch, largeTwdBase, attach_halfN, radices, deviceId);
     }
 }
 
 template <typename T>
-gpubuf
-    twiddles_create_2D_pr(size_t N1, size_t N2, rocfft_precision precision, unsigned int deviceId)
+gpubuf twiddles_create_2D_pr(
+    size_t N1, size_t N2, rocfft_precision precision, const char* gpu_arch, unsigned int deviceId)
 {
     auto                kernel = function_pool::get_kernel(fpkey(N1, N2, precision));
     std::vector<size_t> radices1, radices2;
@@ -515,7 +475,7 @@
             throw std::runtime_error("hipStreamCreate failure");
     }
 
-    TwiddleTable2D<T> twTable(N1, N2);
+    TwiddleTable2D<T> twTable(precision, gpu_arch, N1, N2);
     twTable.GenerateTwiddleTable(radices1, radices2, stream, twts);
 
     if(hipStreamSynchronize(stream) != hipSuccess)
@@ -524,15 +484,17 @@
     return twts;
 }
 
-gpubuf twiddles_create_2D(size_t N1, size_t N2, rocfft_precision precision, unsigned int deviceId)
+gpubuf twiddles_create_2D(
+    size_t N1, size_t N2, rocfft_precision precision, const char* gpu_arch, unsigned int deviceId)
 {
-    if(precision == rocfft_precision_single)
-        return twiddles_create_2D_pr<float2>(N1, N2, precision, deviceId);
-    else if(precision == rocfft_precision_double)
-        return twiddles_create_2D_pr<double2>(N1, N2, precision, deviceId);
-    else
+    switch(precision)
     {
-        assert(false);
-        return {};
+    case rocfft_precision_single:
+        return twiddles_create_2D_pr<rocfft_complex<float>>(N1, N2, precision, gpu_arch, deviceId);
+    case rocfft_precision_double:
+        return twiddles_create_2D_pr<rocfft_complex<double>>(N1, N2, precision, gpu_arch, deviceId);
+    case rocfft_precision_half:
+        return twiddles_create_2D_pr<rocfft_complex<_Float16>>(
+            N1, N2, precision, gpu_arch, deviceId);
     }
 }
diff -Nru rocfft-5.5.0/scripts/perf/datagraphs.asy rocfft-5.7.1/scripts/perf/datagraphs.asy
--- rocfft-5.5.0/scripts/perf/datagraphs.asy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/datagraphs.asy	2023-08-09 16:19:51.000000000 +0000
@@ -37,6 +37,10 @@
 string primaryaxis = "time";
 string secondaryaxis = "speedup";
 
+bool dobars = true;
+bool dolegend = true;
+real Ncut = inf;
+
 usersetting();
 
 if(primaryaxis == "gflops") {
@@ -63,14 +67,31 @@
 // Data containers:
 datapoint[][] datapoints = new datapoint[testlist.length][];
 
+
 readfiles(testlist, datapoints);
 
+bool datapointless(datapoint a, datapoint b)
+{
+    return a.x < b.x;
+}
+
 pair[][] xyval = new real[testlist.length][];
 pair[][] ylowhigh = new real[testlist.length][];
 for(int n = 0; n < datapoints.length; ++n) {
+    datapoints[n] = sort(datapoints[n], datapointless);
     datapoints_to_xyvallowhigh(datapoints[n], xyval[n], ylowhigh[n]);
 }
 
+
+if(Ncut < inf) {
+    for(int n = 0; n < datapoints.length; ++n) {
+        while(xyval[n].length > 0 && xyval[n][xyval[n].length - 1].x > Ncut) {
+            xyval[n].pop();
+            ylowhigh[n].pop();
+        }
+    }
+}
+
 //write(xyval);
 //write(ylowhigh);
 
@@ -97,21 +118,25 @@
     string legend = myleg ? legends[n] : texify(testlist[n]);
     marker mark = marker(scale(0.5mm) * unitcircle, Draw(graphpen + solid));
 
-    // Compute the error bars:
-    pair[] dp; // high
-    pair[] dm; // low
-    for(int i = 0; i < xyval[n].length; ++i) {
-        dp.push((0, -xyval[n][i].y + ylowhigh[n][i].y));
-        dm.push((0, -xyval[n][i].y + ylowhigh[n][i].x));
+    if(dobars) {
+        // Compute the error bars:
+        pair[] dp; // high
+        pair[] dm; // low
+        for(int i = 0; i < xyval[n].length; ++i) {
+            dp.push((0, -xyval[n][i].y + ylowhigh[n][i].y));
+            dm.push((0, -xyval[n][i].y + ylowhigh[n][i].x));
+        }
+        //write(dp);
+        //write(dm);
+        errorbars(xyval[n], dp, dm, graphpen);
     }
-    //write(dp);
-    //write(dm);
-    errorbars(xyval[n], dp, dm, graphpen);
     
     // Actualy plot things:
     draw(graph(xyval[n]), graphpen, legend, mark);
+    
 }
 
+
 xaxis(xlabel, BottomTop, LeftTicks);
 
 yaxis(ylabel, (secondary_filenames != "") ? Left : LeftRight,RightTicks);
@@ -120,8 +145,10 @@
 //                                ? 60*plain.E + 40 *plain.N
 //                                 : 20*plain.E)  );
 //attach(legend(),point(plain.S), N);
-attach(legend(), point(S), 50*S);
-
+if(dolegend) {
+    attach(legend(), point(S), 50*S);
+}
+    
 if(secondary_filenames != "")
 {
   write("secondary_filenames: ", secondary_filenames);
@@ -134,7 +161,8 @@
     pair[][] xyval = new real[second_list.length][];
     pair[][] ylowhigh = new real[second_list.length][];
     for(int n = 0; n < datapoints.length; ++n) {
-      datapoints_to_xyvallowhigh(datapoints[n], xyval[n], ylowhigh[n]);
+        datapoints[n] = sort(datapoints[n], datapointless);
+        datapoints_to_xyvallowhigh(datapoints[n], xyval[n], ylowhigh[n]);
     }
     
     bool interval = true;
@@ -181,7 +209,8 @@
 
 	    
             yaxis(pic, secondaryaxis, Right, black, LeftTicks);
-	    attach(legend(pic), point(plain.E), 60*plain.E - 40 *plain.N  );
+            if(dolegend)
+                attach(legend(pic), point(plain.E), 60*plain.E - 40 *plain.N  );
             //attach(legend(pic), point(plain.S), 120*S);
         });
     add(secondarypic);
diff -Nru rocfft-5.5.0/scripts/perf/histogram.asy rocfft-5.7.1/scripts/perf/histogram.asy
--- rocfft-5.5.0/scripts/perf/histogram.asy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/histogram.asy	2023-08-09 16:19:51.000000000 +0000
@@ -27,7 +27,7 @@
 
 string filename;
 
-int nbinmult = 0;
+int nbinmult = 2;
 
 usersetting();
 
@@ -38,18 +38,31 @@
 file fin = input(filename).line();
 real[] a = fin;
 
+int N = nbinmult * bins(a);
 
-int N = bins(a);
-
-histogram(a, min(0,min(a)), max(0, max(a)), N, normalize=false, low=0, lightred, black, bars=true);
+histogram(a,
+          min(0,min(a)),
+          max(0, max(a)),
+          N,
+          normalize=false,
+          low=0,
+          lightred,
+          black,
+          bars=true);
 
 xequals(0.0);
 
 //label((min(a), 0), string(min(a), 3), 1.5S);
 //label((max(a), 0), string(max(a), 3), 1.5S);
 
-xaxis("Speedup \%", BottomTop, LeftTicks);
-yaxis("Number of Transforms", LeftRight, RightTicks(trailingzero));
+real Step = 0.0;
+if(max(a) - min(a) < 4) {
+    real order = ceil(log(max(a) - min(a))/log(10));
+    Step = 0.5 * 10**(order-1);
+}
+
+xaxis("Speedup \%", BottomTop, LeftTicks(Step=Step));
+yaxis("Number of Transforms", LeftRight, RightTicks);
 
 
 //add(legend(),point(E),20E);
diff -Nru rocfft-5.5.0/scripts/perf/perflib/__init__.py rocfft-5.7.1/scripts/perf/perflib/__init__.py
--- rocfft-5.5.0/scripts/perf/perflib/__init__.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/__init__.py	2023-08-09 16:19:51.000000000 +0000
@@ -3,6 +3,7 @@
 import perflib.html
 import perflib.pdf
 import perflib.rider
+import perflib.tuner
 import perflib.timer
 import perflib.utils
 import perflib.accutest
diff -Nru rocfft-5.5.0/scripts/perf/perflib/analysis.py rocfft-5.7.1/scripts/perf/perflib/analysis.py
--- rocfft-5.5.0/scripts/perf/perflib/analysis.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/analysis.py	2023-08-09 16:19:51.000000000 +0000
@@ -29,17 +29,31 @@
 from typing import List
 
 
-def confidence_interval(vals, alpha=0.95, nboot=2000):
+def confidence_interval(vals, measure, confidence, alpha=0.95, nboot=2000):
     """Compute the alpha-confidence interval for the given values using boot-strap resampling."""
-    medians = []
-    for iboot in range(nboot):
-        resample = []
-        for i in range(len(vals)):
-            resample.append(vals[random.randrange(len(vals))])
-        medians.append(np.median(resample))
-    medians = sorted(medians)
-    low = medians[int(np.floor(nboot * 0.5 * (1.0 - alpha)))]
-    high = medians[int(np.ceil(nboot * (1.0 - 0.5 * (1.0 - alpha))))]
+    if confidence == "bootstrap":
+        medians = []
+        for iboot in range(nboot):
+            resample = []
+            for i in range(len(vals)):
+                resample.append(vals[random.randrange(len(vals))])
+            if measure == "median":
+                medians.append(np.median(resample))
+            elif measure == "mean":
+                medians.append(np.mean(resample))
+        medians = sorted(medians)
+        low = medians[int(np.floor(nboot * 0.5 * (1.0 - alpha)))]
+        high = medians[int(np.ceil(nboot * (1.0 - 0.5 * (1.0 - alpha))))]
+    elif confidence == "stdev":
+        mean = np.mean(vals)
+        stdev = np.std(vals)
+        # NB: alpha is ignored here; mean +/- 2 stdev approximates a 95% interval
+        low = mean - 2 * stdev
+        high = mean + 2 * stdev
+    else:
+        print("invalid value for confidence:", confidence)
+        import sys
+        sys.exit(1)
     return low, high
 
 
@@ -64,6 +78,7 @@
 def moods(reference: Run, others: List[Run]):
     """Perform Moods analysis..."""
     import scipy.stats
+    import numpy
     pvals = {}
     for rname, rdat in reference.dats.items():
         for other in others:
@@ -71,9 +86,24 @@
             for length in rdat.samples.keys():
                 s1 = rdat.samples[length].times
                 s2 = odat.samples[length].times
-                m1 = statistics.median(s1)
-                m2 = statistics.median(s2)
-                _, p, _, _ = scipy.stats.median_test(s1, s2)
+
+                if arguments.measure == 'median':
+                    m1 = statistics.median(s1)
+                    m2 = statistics.median(s2)
+                elif arguments.measure == "mean":
+                    m1 = numpy.mean(s1)
+                    m2 = numpy.mean(s2)
+
+                if arguments.method == 'moods':
+                    _, pval, _, _ = scipy.stats.median_test(s1, s2)
+                elif arguments.method == 'ttest':
+                    _, pval = scipy.stats.ttest_ind(s1, s2)
+                elif arguments.method == 'mwu':
+                    _, pval = scipy.stats.mannwhitneyu(s1, s2)
+                else:
+                    print("unsupported statistical method")
+                    sys.exit(1)
+
                 pvals[other.path.name, rname,
-                      length] = MoodsResult(p, [m1, m2])
+                      length] = MoodsResult(pval, [m1, m2])
     return pvals
diff -Nru rocfft-5.5.0/scripts/perf/perflib/generators.py rocfft-5.7.1/scripts/perf/perflib/generators.py
--- rocfft-5.5.0/scripts/perf/perflib/generators.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/generators.py	2023-08-09 16:19:51.000000000 +0000
@@ -29,6 +29,7 @@
 
 import itertools
 import logging
+import json
 
 from dataclasses import dataclass, field
 from pathlib import Path as path
@@ -68,8 +69,16 @@
     real: bool = False
     precision: str = "single"
     tag: str = None
+    min_wgs: int = 64
+    max_wgs: int = 512
     meta: Dict[str, str] = field(default_factory=dict)
 
+    def toJSON(self):
+        """Return a dict of the fields, omitting 'tag' and 'meta'."""
+        # Build a new dict: deleting keys from self.__dict__ would remove
+        # the attributes from this instance and break a second call.
+        return {k: v for k, v in vars(self).items() if k not in ('tag', 'meta')}
+
 
 @dataclass
 class VerbatimGenerator:
diff -Nru rocfft-5.5.0/scripts/perf/perflib/html.py rocfft-5.7.1/scripts/perf/perflib/html.py
--- rocfft-5.5.0/scripts/perf/perflib/html.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/html.py	2023-08-09 16:19:51.000000000 +0000
@@ -46,7 +46,7 @@
     return ret
 
 
-def significance_colors(significance, threshold=0.05):
+def significance_colors(significance, threshold):
     ret = []
     for s in significance:
         if s < threshold:
@@ -121,7 +121,7 @@
 
 class HTMLFigure(BaseFigure):
 
-    def make(self):
+    def make(self, sig_threshold):
         from plotly import graph_objs as go
         data_frames = to_data_frames(self.primary, self.secondary)
         for df in data_frames:
@@ -225,7 +225,8 @@
                 values.append(
                     ["{:.4f}".format(x) for x in data_frames[i].speedup_pval])
                 fill_colors.append(
-                    significance_colors(data_frames[i].speedup_pval))
+                    significance_colors(data_frames[i].speedup_pval,
+                                        sig_threshold))
 
         table = go.Figure(data=[
             go.Table(
@@ -245,7 +246,7 @@
         self.table = table
 
 
-def make_html(figures, title, docdir, outdirs):
+def make_html(figures, title, docdir, outdirs, significance):
     # TODO: this needs to read the output from the post-processing;
     # graphing and post-processing should be separate.
 
diff -Nru rocfft-5.5.0/scripts/perf/perflib/pdf.py rocfft-5.7.1/scripts/perf/perflib/pdf.py
--- rocfft-5.5.0/scripts/perf/perflib/pdf.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/pdf.py	2023-08-09 16:19:51.000000000 +0000
@@ -75,12 +75,11 @@
             secondary = [x.resolve() for x in self.secondary]
             asycmd.extend(['-u', f'secondary_filenames="{cjoin(secondary)}"'])
 
-        self.filename = (Path(self.docdir) / (self.tag + '.pdf')).resolve()
         asycmd.extend(['-o', self.filename])
 
         return [str(x) for x in asycmd]
 
-    def make(self):
+    def runasy(self):
         asycommand = self.asycmd()
         logging.info('ASY: ' + sjoin(asycommand))
 
@@ -90,7 +89,7 @@
         proc = subprocess.Popen(asycommand, cwd=top, stdout=fout, stderr=ferr)
 
         try:
-            proc.wait(timeout=3)
+            proc.wait(timeout=20)
         except subprocess.TimeoutExpired:
             logging.info("Asy command killed: " + sjoin(asycommand))
             proc.kill()
@@ -114,6 +113,9 @@
             print(cout)
             print(cerr)
 
+    def make(self, significance):
+        self.filename = (Path(self.docdir) / (self.tag + '.pdf')).resolve()
+
 
 gflopstext = '''\
 GFLOP/s are computed based on the Cooley--Tukey operation count \
@@ -131,7 +133,7 @@
 for the device.'''
 
 
-def make_tex(figs, docdir, outdirs, label, secondtype=None):
+def make_tex(figs, docdir, outdirs, label, significance, secondtype=None):
     """Generate PDF containing performance figures."""
 
     docdir = Path(docdir)
@@ -143,6 +145,7 @@
 \\usepackage{url}
 \\usepackage{hyperref}
 \\usepackage{float}
+\\usepackage{longtable}
 \\begin{document}
 \\hypersetup{
   pdfborder={0,0,0},
@@ -193,6 +196,10 @@
     df_all_bad = pandas.DataFrame()
 
     ncompare = 0
+    for idx, fig in enumerate(figs):
+        for p in fig.secondary:
+            df = pandas.read_csv(p, sep="\t", comment='#')
+            ncompare += len(df.index)
 
     # We need a list of speedups to compute the geometric mean via
     # sicpy.stats; the naive calculation suffers from issues with
@@ -218,10 +225,8 @@
             for row in df.itertuples(index=False):
                 speedups.append(row.speedup)
 
-            ncompare += len(df.index)
-
             # Significant results:
-            df_sig = df.loc[df['speedup_pval'] < 0.05]
+            df_sig = df.loc[df['speedup_pval'] <= significance]
 
             # Significant results that are good or bad:
             df_good = df_sig.loc[df_sig['speedup'] > 1]
@@ -231,9 +236,7 @@
 
                 df_all_good = pandas.concat([df_all_good, df_good])
 
-                figtex += "\\begin{table}[H]\n"
-                figtex += "\\centering\n"
-                figtex += "\\begin{tabular}{l|l|l|}\n"
+                figtex += "\\begin{longtable}{l|l|l|}\n"
                 figtex += "transform & speedup \% & significance\\\\ \n"
                 figtex += "\\hline\n"
                 for row in df_good.itertuples(index=False):
@@ -244,21 +247,22 @@
                     figtex += "$" + "\\times{}".join(str(x)
                                                      for x in length) + "$"
 
+                    if np.prod(batch) > 1:
+                        figtex += " by $" + "\\times{}".join(
+                            str(x) for x in batch) + "$"
+
                     speedup = '{0:.3f}'.format((row.speedup - 1) * 100)
                     pval = '{0:.3f}'.format(row.speedup_pval)
                     figtex += " & " + str(speedup) + " & " + str(pval) + "\\\\"
                 figtex += "\\hline\n"
-                figtex += "\\end{tabular}\n"
                 figtex += "\\caption{Improvements for " + fig.caption + "}\n"
-                figtex += "\\end{table}\n"
+                figtex += "\\end{longtable}\n"
 
             if not df_bad.empty:
 
                 df_all_bad = pandas.concat([df_all_bad, df_bad])
 
-                figtex += "\\begin{table}[H]\n"
-                figtex += "\\centering\n"
-                figtex += "\\begin{tabular}{l|l|l|}\n"
+                figtex += "\\begin{longtable}{l|l|l|}\n"
                 figtex += "transform & slowdown \% & significance\\\\ \n"
                 figtex += "\\hline\n"
                 for row in df_bad.itertuples(index=False):
@@ -278,16 +282,14 @@
                     pval = '{0:.3f}'.format(row.speedup_pval)
                     figtex += " & " + str(speedup) + " & " + str(pval) + "\\\\"
                 figtex += "\\hline\n"
-                figtex += "\\end{tabular}\n"
                 figtex += "\\caption{Regressions for " + fig.caption + "}\n"
-                figtex += "\\end{table}\n"
+                figtex += "\\end{longtable}\n"
 
         figtex += "\\clearpage\n"
 
     nspeedup = len(df_all_good.index)
     nslowdown = len(df_all_bad.index)
 
-    print("ncompare:", ncompare)
     print(
         "nspeedup  (" + label[1] + " is faster): " +
         " " * max(len(label[0]) - len(label[1]), 0), nspeedup)
@@ -337,7 +339,7 @@
 
         asyproc = subprocess.Popen(asycmd, cwd=top, stdout=fout, stderr=ferr)
         try:
-            asyproc.wait(timeout=5)
+            asyproc.wait(timeout=20)
         except subprocess.TimeoutExpired:
             logging.info("asy command killed: " + sjoin(asycmd))
             asyproc.kill()
@@ -364,6 +366,33 @@
 
     tex += figtex
 
+    if nspeedup > 0:
+        tex += "\\clearpage\n"
+        tex += "tokens for improved performance\n"
+        tex += "\\begin{tiny}"
+        tex += "\\begin{verbatim}"
+        for row in df_all_good.itertuples(index=False):
+            #print(row.token)
+
+            tex += str(row.token) + "\n"
+            #tex += "\\small\\texttt{" + str(row.token).replace("_", "\\_") + "}\n"
+            #tex += str(row.token).replace("_", "\\_") + "\n"
+        tex += "\\end{verbatim}"
+        tex += "\\end{tiny}"
+
+    if nslowdown > 0:
+        print("There were", nslowdown, "regressions.  The tokens are:")
+        tex += "\\clearpage\n"
+        tex += "tokens for regressed performance\n"
+        tex += "\\begin{tiny}"
+        tex += "\\begin{verbatim}"
+        for row in df_all_bad.itertuples(index=False):
+            print(row.token)
+            tex += str(row.token) + "\n"
+            #tex += "\\small\\texttt{" + str(row.token).replace("_", "\\_") + "}\n"
+        tex += "\\end{verbatim}"
+        tex += "\\end{tiny}"
+
     tex += "\n\\end{document}\n"
 
     fname = docdir / 'figs.tex'
diff -Nru rocfft-5.5.0/scripts/perf/perflib/rider.py rocfft-5.7.1/scripts/perf/perflib/rider.py
--- rocfft-5.5.0/scripts/perf/perflib/rider.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/rider.py	2023-08-09 16:19:51.000000000 +0000
@@ -38,7 +38,8 @@
         device=None,
         libraries=None,
         verbose=False,
-        timeout=300):
+        timeout=300,
+        sequence=None):
     """Run rocFFT rider and return execution times."""
     cmd = [pathlib.Path(rider).resolve()]
 
@@ -50,13 +51,21 @@
     if libraries is not None:
         for library in libraries:
             cmd += ['--lib', pathlib.Path(library).resolve()]
+        if len(libraries) > 1:
+            # only use different randomizations if using dyna-rider
+            if sequence is not None:
+                cmd += ['--sequence', str(sequence)]
 
     cmd += ['-N', ntrial]
     cmd += ['-b', nbatch]
     if not inplace:
         cmd += ['-o']
-    if precision == 'double':
-        cmd += ['--double']
+    if precision == 'half':
+        cmd += ['--precision', 'half']
+    elif precision == 'single':
+        cmd += ['--precision', 'single']
+    elif precision == 'double':
+        cmd += ['--precision', 'double']
     if device is not None:
         cmd += ['--device', device]
 
@@ -101,10 +110,21 @@
     token = ""
     times = []
 
+    soltokenTag = "[SolToken]: "
+    soltoken = ""
+    matchTag = "[TokenMatch]: "
+    match = ""
+
     for line in cout.splitlines():
         if line.startswith(tokentoken):
             token = line[len(tokentoken):]
 
+    for line in cerr.splitlines():
+        if line.startswith(soltokenTag):
+            soltoken = line[len(soltokenTag):]
+        elif line.startswith(matchTag):
+            match = line[len(matchTag):]
+
     if proc.returncode == 0:
         for m in re.finditer('Execution gpu time: ([ 0-9.]*) ms', cout,
                              re.MULTILINE):
@@ -126,4 +146,4 @@
 
     success = proc.returncode == 0
 
-    return token, times, success
+    return token, times, success, soltoken, match
diff -Nru rocfft-5.5.0/scripts/perf/perflib/timer.py rocfft-5.7.1/scripts/perf/perflib/timer.py
--- rocfft-5.5.0/scripts/perf/perflib/timer.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/timer.py	2023-08-09 16:19:51.000000000 +0000
@@ -41,6 +41,7 @@
     ntrial: int = 10
     verbose: bool = False
     timeout: float = 0
+    sequence: int = None
 
     def run_cases(self, generator):
 
@@ -54,7 +55,7 @@
         no_accutest_prob_count = 0
         for prob in generator.generate_problems():
             total_prob_count += 1
-            token, seconds, success = perflib.rider.run(
+            token, seconds, success, __, __ = perflib.rider.run(
                 self.rider,
                 prob.length,
                 direction=prob.direction,
@@ -66,7 +67,8 @@
                 device=self.device,
                 libraries=self.lib,
                 verbose=self.verbose,
-                timeout=self.timeout)
+                timeout=self.timeout,
+                sequence=self.sequence)
 
             if success:
                 for idx, vals in enumerate(seconds):
diff -Nru rocfft-5.5.0/scripts/perf/perflib/tuner.py rocfft-5.7.1/scripts/perf/perflib/tuner.py
--- rocfft-5.5.0/scripts/perf/perflib/tuner.py	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/perflib/tuner.py	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,216 @@
+# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""Tuner launch utils."""
+
+import logging
+import pathlib
+import re
+import subprocess
+import time
+from perflib.utils import cjoin
+
+
+def run(tuner,
+        length,
+        direction=-1,
+        real=False,
+        inplace=True,
+        precision='single',
+        nbatch=1,
+        ntrial=1,
+        device=None,
+        verbose=False,
+        timeout=10):
+    """Run the rocFFT tuner on one problem and parse its standard output.
+
+    Returns (token, outFileName, msg, success); timeout == 0 waits forever.
+    """
+    cmd = [pathlib.Path(tuner).resolve()]
+
+    if isinstance(length, int):
+        cmd += ['--length', length]
+    else:
+        cmd += ['--length', cjoin([str(n) for n in length])]
+
+    cmd += ['-N', ntrial]
+    cmd += ['-b', nbatch]
+    if not inplace:
+        cmd += ['-o']
+    if precision in ('half', 'single', 'double'):
+        cmd += ['--precision', precision]
+    if device is not None:
+        cmd += ['--device', device]
+
+    if real:
+        if direction == -1:
+            cmd += ['-t', 2, '--itype', 2, '--otype', 3]
+        if direction == 1:
+            cmd += ['-t', 3, '--itype', 3, '--otype', 2]
+    else:
+        if direction == -1:
+            cmd += ['-t', 0]
+        if direction == 1:
+            cmd += ['-t', 1]
+
+    cmd = [str(x) for x in cmd]
+    logging.info('tunning: ' + ' '.join(cmd))
+    if verbose:
+        print('tunning: ' + ' '.join(cmd))
+
+    tokenToken = "Token: "
+    outFileToken = "[OUTPUT_FILE]: "
+    resultToken = "[Result]: "
+    token = ""
+    outFileName = ""
+    msg = "[Solution]:\n"
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    for line in proc.stdout:
+        line = line.decode('utf-8').rstrip('\n')
+        print(line)
+        if line.startswith(tokenToken):
+            token = line[len(tokenToken):]
+        elif line.startswith(outFileToken):
+            outFileName = line[len(outFileToken):]
+        elif line.startswith(resultToken):
+            msg += line[len(resultToken):] + '\n'
+
+    try:
+        proc.wait(timeout=None if timeout == 0 else timeout)
+    except subprocess.TimeoutExpired:
+        logging.info("timeout expired. killed. Please check the process.")
+        proc.kill()
+        proc.wait()  # reap the killed child so returncode is set
+    success = proc.returncode == 0
+
+    return token, outFileName, msg, success
+
+
+def accuracy_test(validator,
+                  length,
+                  direction=-1,
+                  real=False,
+                  inplace=True,
+                  precision='single',
+                  nbatch=1,
+                  token=None,
+                  timeout=10):
+    """Run the rocFFT accuracy test for one problem.
+
+    Returns True only when the test binary exits with status 0.
+    """
+    cmd = [pathlib.Path(validator).resolve()]
+
+    cmd += ['--gtest_filter=man*']
+
+    # use token if we have it
+    if token is not None:
+        cmd += ['--token', token]
+    # else, specify each arg
+    else:
+        if isinstance(length, int):
+            cmd += ['--length', length]
+        else:
+            cmd += ['--length'] + list(length)
+
+        cmd += ['-b', nbatch]
+        if not inplace:
+            cmd += ['-o']
+        if precision in ('half', 'single', 'double'):
+            cmd += ['--precision', precision]
+
+        if real:
+            if direction == -1:
+                cmd += ['-t', 2, '--itype', 2, '--otype', 3]
+            if direction == 1:
+                cmd += ['-t', 3, '--itype', 3, '--otype', 2]
+        else:
+            if direction == -1:
+                cmd += ['-t', 0]
+            if direction == 1:
+                cmd += ['-t', 1]
+
+    cmd = [str(x) for x in cmd]
+    logging.info('accuracy testing: ' + ' '.join(cmd))
+    print('accuracy testing: ' + ' '.join(cmd))
+
+    passToken = "[  PASSED  ] 1 test"
+    passed = False
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    for line in proc.stdout:
+        line = line.decode('utf-8').rstrip('\n')
+        if line.startswith(passToken):
+            print(line)
+            passed = True
+
+    try:
+        proc.wait(timeout=None if timeout == 0 else timeout)
+    except subprocess.TimeoutExpired:
+        logging.info("timeout expired. killed. Please check the process.")
+        proc.kill()
+        proc.wait()  # reap the killed child so returncode is set
+    success = proc.returncode == 0
+
+    if not success:
+        print('[  FAILED  ]: ' + ' '.join(cmd))
+
+    return success
+
+
+def merge(merger,
+          base_file_path,
+          new_files,
+          new_probTokens,
+          out_file_path,
+          verbose=False,
+          timeout=30):
+    """Run rocFFT tuner with command merge; return True on exit status 0."""
+
+    cmd = [pathlib.Path(merger).resolve()]
+
+    cmd += ['--command', '1']
+    cmd += ['--new_sol_file', str(new_files)]
+    cmd += ['--new_probkey', str(new_probTokens)]
+    cmd += ['--output_sol_file', str(out_file_path)]
+    if base_file_path is not None:
+        cmd += ['--base_sol_file', str(base_file_path)]
+
+    cmd = [str(x) for x in cmd]
+    logging.info('merging: ' + ' '.join(cmd))
+    if verbose:
+        print('merging: ' + ' '.join(cmd))
+
+    # The merger reports only an exit code, so no output is captured; the
+    # timeout guards against a hang in its recursive tree operations.
+    proc = subprocess.Popen(cmd)
+
+    try:
+        proc.wait(timeout=None if timeout == 0 else timeout)
+    except subprocess.TimeoutExpired:
+        logging.info("timeout expired. killed. Please check the process.")
+        proc.kill()
+        proc.wait()  # reap the killed child so returncode is set
+    success = proc.returncode == 0
+
+    if not success:
+        print('Failed on merging:' + ' '.join(cmd))
+
+    return success
diff -Nru rocfft-5.5.0/scripts/perf/rocfft-perf rocfft-5.7.1/scripts/perf/rocfft-perf
--- rocfft-5.5.0/scripts/perf/rocfft-perf	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/rocfft-perf	2023-08-09 16:19:51.000000000 +0000
@@ -153,51 +153,112 @@
 #
 
 
-def command_moods(arguments):
-    """Find significant (Moods) regressions."""
-    
-    path = Path(arguments.output)
+def command_test(arguments):
+    """Test for regressions."""
+
+    # FIXME: rename and replace above comment
 
     sig = arguments.significance
+    outdirs = [Path(x) for x in arguments.runs]
+    verbose = arguments.verbose
 
-    import pandas
+    significance = arguments.significance
+    bonferroni = arguments.bonferroni
+
+    all_runs = perflib.utils.read_runs(outdirs, verbose)
+
+    if len(all_runs) != 2:
+        print("Error: one must provide exactly two runs for statistical comparison")
+        sys.exit(1)
+
+    import numpy
+    import scipy.stats
 
     ncompare = 0
-    df_failures = pandas.DataFrame()
-    
-    sdats = sorted(list(path.glob('*.sdat')))
-    for sdat in sdats:
-        df = pandas.read_csv(sdat, delimiter='\t', comment='#')
-        ncompare += len(df.index)
-        if df_failures.empty:
-            df_failures = pandas.DataFrame(columns = df.columns)
-        df_failures = pandas.concat([df_failures, df.loc[(df['speedup_pval'] < sig)
-                                                         & (df['speedup'] < 1)]],
-                                    ignore_index = True, axis = 0 )
-
-    if len(df_failures.index) > 0:
-        print("Regressions:")
-        print("token\tspeedup\tsignificance")
-    for index, row in df_failures.iterrows():
-        print(row['token'],
-              '\t', '{0:.4e}'.format(1-row['speedup']),
-              '\t', '{0:.3f}'.format(row['speedup_pval']))
+    slower = []
+    faster = []
 
-    print()
-    print(len(df_failures.index), "regressions in", ncompare, "tests using significance",
-          arguments.significance)
-        
-    return 0 if df_failures.empty else 1
+    runs = perflib.utils.by_dat(all_runs)
+    refdir, testdir = outdirs
+
+
+    # In order to do the Bonferroni correction, we need to adjust the
+    # significance threshold based on the number of tests, so count
+    # them first.
+    for dat_name, dat_runs in runs.items():
+        refdat = dat_runs[refdir]
+        testdat = dat_runs[testdir]
+        for token, sample in refdat.get_samples():
+            if token not in testdat.samples:
+                continue
+            ncompare += 1
+    if bonferroni and ncompare > 0:
+        significance /= ncompare
+
+    for dat_name, dat_runs in runs.items():
+        refdat = dat_runs[refdir]
+        testdat = dat_runs[testdir]
+        for token, sample in refdat.get_samples():
+            if token not in testdat.samples:
+                continue
+
+            #print(token)
+            Avals = refdat.samples[token].times
+            Bvals = testdat.samples[token].times
+
+            pval = -1
+            if arguments.method == 'moods':
+                _, pval, _, _ = scipy.stats.median_test(Avals, Bvals)
+                if pval < significance:
+                    if statistics.median(Avals) > statistics.median(Bvals):
+                        faster.append(token)
+                    else:
+                        slower.append(token)
+            elif arguments.method == 'ttest':
+                _, pval = scipy.stats.ttest_ind(Avals, Bvals)
+                if pval < significance:
+                    if numpy.mean(Avals) > numpy.mean(Bvals):
+                        faster.append(token)
+                    else:
+                        slower.append(token)
+            elif  arguments.method == 'mwu':
+                _, pval = scipy.stats.mannwhitneyu(Avals, Bvals)
+                if pval < significance:
+                    if statistics.median(Avals) > statistics.median(Bvals):
+                        faster.append(token)
+                    else:
+                        slower.append(token)
+            else:
+                print("unsupported statistical method")
+                sys.exit(1)
+
+    if verbose:
+        print("faster:", faster)
+        print("slower:", slower)
+
+    print("nh0:", ncompare - (len(faster) + len(slower)))
+    print("nh1:", len(faster) + len(slower))
 
+    print("ncompare:", ncompare)
+    print("faster:", len(faster))
+    print("slower:", len(slower))
 
-def generate_mdat(dat):
-    confidence = [['token', 'median_sample', 'median_low', 'median_high']]
+    return len(slower) > 0
+
+
+def generate_mdat(dat, measure, confidence):
+    import numpy
+    vals = [['token', 'median_sample', 'median_low', 'median_high']]
     for token, sample in dat.get_samples():
-        median = statistics.median(sample.times)
-        low, high = perflib.analysis.confidence_interval(sample.times)
-        confidence.append([sample.label, median, low, high])
+        if measure == "median":
+            median = statistics.median(sample.times)
+        elif measure == "mean":
+            median = numpy.mean(sample.times)
+        low, high = perflib.analysis.confidence_interval(sample.times,
+                                                         measure=measure, confidence=confidence)
+        vals.append([sample.label, median, low, high])
     path = dat.path.with_suffix('.mdat')
-    perflib.utils.write_tsv(path, confidence, meta=dat.meta, overwrite=True)
+    perflib.utils.write_tsv(path, vals, meta=dat.meta, overwrite=True)
 
 
 def generate_pts_dat(dat):
@@ -296,10 +357,12 @@
     docdir = arguments.output
     verbose = arguments.verbose
 
+    import itertools
+
     if verbose:
         print("docdir:", docdir)
         print("outdirs:", outdirs)
-    
+
     outdirs = [Path(x) for x in outdirs]
 
     all_runs = perflib.utils.read_runs(outdirs, verbose)
@@ -307,7 +370,10 @@
     # median confidence intervals
     for run in all_runs:
         with Pool(None) as p:
-            p.map(generate_mdat, run.dats.values())
+            p.starmap(generate_mdat,
+                      itertools.product(run.dats.values(),
+                                        [arguments.measure],
+                                        [arguments.confidence]))
             p.map(generate_pts_dat, run.dats.values())
 
     # speedup and pvals
@@ -315,7 +381,7 @@
         docdir = Path(docdir)
         docdir.mkdir(parents=True, exist_ok=True)
 
-        import scipy.stats
+        import scipy.stats, numpy
 
         runs = perflib.utils.by_dat(all_runs)
         refdir, *otherdirs = outdirs
@@ -335,11 +401,23 @@
                     sample = refdat.samples[token]
                     Avals = refdat.samples[token].times
                     Bvals = otherdat.samples[token].times
-                    speedup = statistics.median(Avals) / statistics.median(
-                        Bvals)
+                    if arguments.measure == "median":
+                        speedup = statistics.median(Avals) / statistics.median(Bvals)
+                    elif arguments.measure == "mean":
+                        speedup = numpy.mean(Avals) / numpy.mean(Bvals)
                     low, high = perflib.analysis.ratio_confidence_interval(
                         Avals, Bvals)
-                    _, pval, _, _ = scipy.stats.median_test(Avals, Bvals)
+                    pval = -1
+                    if arguments.method == 'moods':
+                        _, pval, _, _ = scipy.stats.median_test(Avals, Bvals)
+                    elif arguments.method == 'ttest':
+                        _, pval = scipy.stats.ttest_ind(Avals, Bvals)
+                    elif  arguments.method == 'mwu':
+                        _, pval = scipy.stats.mannwhitneyu(Avals, Bvals)
+                    else:
+                        print("unsupported statistical method")
+                        sys.exit(1)
+
                     speedups.append([sample.token, speedup, low, high, pval])
                 path = docdir / (str(otherdat.path.parent.name) + '-over-' +
                                  str(refdat.path.parent.name) + '-' +
@@ -350,7 +428,10 @@
                                         overwrite=True)
 
 
-def command_generate(runs=None, label=None, output=None, significance=None, type='pdf', **kwargs):
+
+
+def command_generate(runs=None, label=None, output=None, significance=None, bonferroni=None,
+                     type='pdf', **kwargs):
     """Generate PDF/HTML/DOCX from run results."""
 
     import perflib.pdf
@@ -370,6 +451,8 @@
         label = [outdir.stem for outdir in outdirs]
     reference = perflib.utils.read_run(outdirs[0])
 
+    import pandas
+    ncompare = 0
     figures = []
     for datname in perflib.utils.list_runs(outdirs[0]):
         tag = datname.stem
@@ -382,17 +465,33 @@
             tag, docdir, outdirs)
         figure = Figure(tag, title, caption, docdir, label, primary, secondary,
                         figtype)
-        figure.make()
+        for p in figure.secondary:
+            df = pandas.read_csv(p, sep="\t", comment='#')
+            ncompare += len(df.index)
         figures.append(figure)
 
+    print("ncompare:", ncompare)
+    if bonferroni and ncompare > 0:
+        significance /= ncompare
+
+    for figure in figures:
+        figure.make(significance)
+
+    if type == 'pdf':
+        pool = Pool(None)
+        for figure in figures:
+            pool.map_async(Figure.runasy, [figure])
+        pool.close()
+        pool.join()
+
     if type == 'pdf':
         perflib.pdf.make_tex(figures, docdir, outdirs, label, significance)
     if type == 'html':
         title = f"Performance report: {perflib.utils.cjoin(outdirs)}"
-        perflib.html.make_html(figures, title, docdir, outdirs)
+        perflib.html.make_html(figures, title, docdir, outdirs, significance)
     if type == 'docx':
         import perflib.docx
-        perflib.docx.make_docx(figures, docdir, outdirs)
+        perflib.docx.make_docx(figures, docdir, outdirs, significance)
 
 
 def command_run(arguments):
@@ -453,7 +552,7 @@
     timer = perflib.timer.GroupedTimer()
     for attr in [
             'device', 'rider', 'accutest', 'lib', 'out', 'device', 'ntrial',
-            'verbose', 'timeout'
+            'verbose', 'timeout', 'sequence'
     ]:
         update(attr, timer, arguments)
 
@@ -466,7 +565,7 @@
     failed_tokens = timer.run_cases(filtered(generator))
 
     print()
-    
+
     logging.info("failed tokens: " + "\n".join(failed_tokens))
     print("failed tokens:\n" + "\n".join(failed_tokens))
 
@@ -483,6 +582,7 @@
     suite = arguments.suite
     format = arguments.format
     static = arguments.static
+    timeout = arguments.timeout
 
     # Use the short version of the hashes (default length: 7)
     if commit != None:
@@ -528,18 +628,21 @@
         timer1.lib = None
         timer1.out = [build1]
         timer1.ntrial = 20
+        timer1.timeout = timeout
 
         timer2 = perflib.timer.GroupedTimer()
         timer2.rider = build2 / 'rocfft-rider'
         timer2.lib = None
         timer2.out = [build2]
         timer2.ntrial = 20
+        timer2.timeout = timeout
         timers = [timer1, timer2]
     else:
         timer = perflib.timer.GroupedTimer()
         timer.rider = build1 / 'dyna-rocfft-rider'
         timer.lib = [lib1, lib2]
         timer.out = [build1, build2]
+        timer.timeout = timeout
         timers = [timer]
 
     specs = perflib.specs.get_machine_specs(timers[0].device)
@@ -555,14 +658,12 @@
     # post-process results
     arguments.runs = [build1, build2]
     arguments.output = output
+    arguments.label=[reference_label, label]
     command_post(arguments)
 
     # generate report
     for report_type in format:
-        command_generate(runs=[build1, build2],
-                         label=[reference_label, label],
-                         output=output,
-                         type=report_type)
+        command_generate(type=report_type, **vars(arguments))
 
 
 def command_bweff(arguments):
@@ -600,7 +701,12 @@
         for prob in generator.generate_problems():
 
             # determine appropriate batch size
-            elem_size_bytes = 8 if prob.precision == "single" else 16
+            # Look the element size up per problem so an unrecognised
+            # precision can never reuse the (already length-scaled) value left
+            # over from the previous problem; fall back to 16 bytes as the
+            # pre-"half" code did for anything that was not "single".
+            elem_size_bytes = {"half": 4, "single": 8,
+                               "double": 16}.get(prob.precision, 16)
 
             for length in prob.length:
                 elem_size_bytes *= length
@@ -677,13 +783,22 @@
     logging.info("output: " + str(out))
     meta = {'title': "median values"}
     for key in medians:
-        records = [
-            key[0],  # token
-            key[1],  # index
-            key[2],  # scheme
-            statistics.median(medians[key][0]),  # duration_ms
-            statistics.median(medians[key][1])  # bw_efficiency_pct
-        ]
+        # Pick the central-tendency function.  The attribute was misspelled
+        # `mesaure`, which raised AttributeError; fall back to the median when
+        # the bweff parser does not define --measure.
+        measure = getattr(arguments, 'measure', 'median')
+        if measure == "mean":
+            import numpy  # local import, as generate_mdat/command_post do
+            center = numpy.mean
+        else:
+            center = statistics.median
+        records = [
+            key[0],  # token
+            key[1],  # index
+            key[2],  # scheme
+            center(medians[key][0]),  # duration_ms
+            center(medians[key][1])  # bw_efficiency_pct
+        ]
         perflib.utils.write_tsv(out, [records], meta=meta)
 
 
@@ -697,7 +812,7 @@
         prog='rocfft-perf',
         epilog="For a detailed usage overview, run: %(prog)s overview")
     parser.add_argument('-v', '--verbose', action='store_true', default=False)
-   
+
     subparsers = parser.add_subparsers(dest='command')
 
     subparsers.add_parser('overview', help='print a general usage overview')
@@ -708,20 +823,53 @@
     pdf_parser   = subparsers.add_parser('pdf', help='generate pdf plots')
     html_parser  = subparsers.add_parser('html', help='generate html plots')
     docx_parser  = subparsers.add_parser('docx', help='generate docx plots')
-    moods_parser = subparsers.add_parser('moods', help='perform moods test')
+    test_parser = subparsers.add_parser('test', help='test for regressions')
+    autoperf_parser = subparsers.add_parser(
+        'autoperf',
+        help='clone, build, run, post, and plot two rocFFT commits')
+
 
-    
-    for p in [post_parser, pdf_parser, html_parser, docx_parser, moods_parser]:
-        p.add_argument('output', type=str)
     for p in [post_parser, pdf_parser, html_parser, docx_parser]:
+        p.add_argument('output', type=str)
+    for p in [post_parser, pdf_parser, html_parser, docx_parser, test_parser]:
         p.add_argument('runs', type=str, nargs='+')
-        
-    for p in [pdf_parser, html_parser, docx_parser, moods_parser]:
+
+    for p in [post_parser, autoperf_parser]:
+        p.add_argument('--confidence',
+                       type=str,
+                       choices=["bootstrap", "stdev"],
+                       help="method for generating confidence interval",
+                       default="bootstrap")
+
+    for p in [post_parser, pdf_parser, test_parser, autoperf_parser]:
+        p.add_argument('--method',
+                       type=str,
+                       choices=["moods", "ttest", "mwu"],
+                       help="statistical method",
+                       default="moods")
+    for p in [post_parser, pdf_parser, html_parser, docx_parser, test_parser, autoperf_parser]:
+        p.add_argument('--measure',
+                       type=str,
+                       choices=["mean", "median"],
+                       help="measure of central tendency: median or mean",
+                       default="median")
+    for p in [pdf_parser, html_parser, docx_parser, test_parser, autoperf_parser]:
         p.add_argument('--significance',
                        type=float,
                        help='moods significance threshold',
                        default=0.001)
-        
+        p.add_argument('--bonferroni',
+                       action='store_true',
+                       help='Apply Bonferroni significance correction')
+        p.add_argument('--no-bonferroni', dest='bonferroni',
+                       action='store_false')
+        p.set_defaults(bonferroni=True)
+        # Python 3.9+ method:
+        # p.add_argument('--bonferroni',
+        #                help='Apply Bonferroni significance correction',
+        #                type=bool,
+        #                action=argparse.BooleanOptionalAction,
+        #                default=True)
     for p in [pdf_parser, html_parser, docx_parser]:
         p.add_argument('-l',
                        '--label',
@@ -816,13 +964,18 @@
                             '--ntrial',
                             type=int,
                             help='number of trials',
-                            default=10)
+                            default=20)
     run_parser.add_argument(
         '-T',
         '--timeout',
         type=int,
         help='test timeout in seconds (0 disables timeout)',
         default=600)
+    run_parser.add_argument(
+        '--sequence',
+        type=int,
+        help='dyna-rider test sequence',
+        default=0)
     run_parser.add_argument('-f',
                             '--precision',
                             type=str,
@@ -833,9 +986,6 @@
                             type=str,
                             help='accuracy test executable path')
 
-    autoperf_parser = subparsers.add_parser(
-        'autoperf',
-        help='clone, build, run, post, and plot two rocFFT commits')
     autoperf_parser.add_argument('--workdir',
                                  type=str,
                                  help='Working directory',
@@ -876,6 +1026,12 @@
                                  help='Use static rider instead of dyna',
                                  action='store_true',
                                  default=False)
+    autoperf_parser.add_argument(
+        '-T',
+        '--timeout',
+        type=int,
+        help='test timeout in seconds (0 disables timeout)',
+        default=600)
 
     bweff_parser = subparsers.add_parser(
         'bweff', help='bandwidth efficiency collection')
@@ -923,12 +1079,12 @@
 
     if arguments.command == 'run':
         command_run(arguments)
-        
+
     if arguments.command == 'post':
         command_post(arguments)
-        
-    if arguments.command == 'moods':
-        sys.exit(command_moods(arguments))
+
+    if arguments.command == 'test':
+        sys.exit(command_test(arguments))
 
     if arguments.command == 'pdf':
         command_generate(type='pdf', **vars(arguments))
diff -Nru rocfft-5.5.0/scripts/perf/rocfft-tuner rocfft-5.7.1/scripts/perf/rocfft-tuner
--- rocfft-5.5.0/scripts/perf/rocfft-tuner	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/rocfft-tuner	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,691 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""Offline tuning utilities for rocFFT.
+
+Overview
+========
+
+General workflow:
+
+- tune: runs a suite of FFTs to collect timing information
+- merge: post processes timing information to compute various statistics
+
+General arguments shared between tune and merge commands:
+
+--workspace :
+--tuner     :
+--rider     :
+
+Runs/subprocesses are logged to `rocfft-tuner.log`.
+
+
+Tune
+===
+
+The 'tune' command drives rocfft-offline-tuner. To build this executable,
+you must add -DROCFFT_BUILD_OFFLINE_TUNER=ON to the build options. There
+is a handy argument -t | --tuner for install.sh (`./install.sh -t` or `--tuner`).
+
+Test problems are generated using a `ProblemGenerator` and a filter.
+The problem generator is the same one used by rocfft-perf (rider).
+
+See
+
+  $ rocfft-tuner -h
+
+Using the `--suite/-S` option, problems are loaded from a "suites"
+file.  The default suites file is `suites.py`.  Alternatively, you can
+load the suite named "qa1" from a file called "mysuites.py" like this:
+
+  $ rocfft-tuner -w [workspace] tune -s mysuites:qa1 ...
+
+That is, FILENAME:SUITENAME.
+
+All the output files are stored in the workspace directory.
+Sub-folder `ResultSolutions` contains the tuned solution map.
+Sub-folder `TuningCandidates` contains intermediate candidate solutions,
+which are kept for debugging purposes.
+Sub-folder `TuningData` contains csv files recording some benchmarking
+numbers and kernel information, which are for analysis purposes.
+
+Merge
+===============
+
+Dynamic testing is enabled by specifying more than one `--lib/-i`
+option.  These are passed down to the rider, and hence it is assumed
+that the specific rider is a "dyna" rider.
+
+Multiple output directories are used to store the results.
+
+
+"""
+
+import argparse
+import logging
+import subprocess
+import statistics
+import sys
+import os
+import tempfile
+import re
+import collections
+import json
+import pathlib
+
+from os import listdir
+from os.path import isfile, join
+from copy import deepcopy
+from pathlib import Path
+
+from multiprocessing import Pool
+
+top = Path(__file__).resolve().parent
+sys.path.append(str(top))
+
+import perflib
+
+from perflib.generators import Problem
+from perflib.utils import flatten
+
+console = logging.StreamHandler()
+
+import types
+
+
+#
+# Helpers
+#
+def create_launcher(generator):
+    """Return a launcher dict whose 'problems' list is present even when empty."""
+    launcher = collections.defaultdict(list)
+    launcher['problems'].extend(
+        problem.toJSON() for problem in generator.generate_problems())
+    return launcher
+
+
+# get gfx___ which is the prefix of the solution map
+def get_local_gpu_gfx():
+    """Return the local GPU's raw gfxNNN target as reported by rocminfo, or ''."""
+    try:
+        # run() waits for rocminfo and closes its pipe; the bare Popen leaked both.
+        out = subprocess.run(["rocminfo"], stdout=subprocess.PIPE).stdout
+        for line in out.splitlines():
+            if b'amdgcn-amd-amdhsa--' in line:
+                # gfxNNN[:sramecc][:xnack+/-] -> raw gfxNNN
+                return line.split(b'--')[1].strip().split(b':')[0].decode('utf-8')
+    except (OSError, ValueError):
+        # best effort: rocminfo missing/unrunnable or undecodable output
+        pass
+    return ''
+
+
+# #
+# # Commands
+# #
+def command_tuning(arguments):
+    """Run the tuner on every problem and write the tuning/merging metadata files."""
+
+    if arguments.workspace:
+        workspace = Path(arguments.workspace)
+        workspace.mkdir(parents=True, exist_ok=True)
+        dump_folder = workspace / "TuningCandidates"
+        result_folder = workspace / "ResultSolutions"
+        csv_folder = workspace / "TuningData"
+        dump_folder.mkdir(parents=True, exist_ok=True)
+        result_folder.mkdir(parents=True, exist_ok=True)
+        csv_folder.mkdir(parents=True, exist_ok=True)
+        os.environ['TUNING_WORKSPACE'] = arguments.workspace
+    else:
+        print(
+            "Workspace not set. use -w /path/of/workspace before command arg")
+        return
+
+    if arguments.input:
+        with open(arguments.input, 'r') as tuning_metadata_file:
+            launcher = json.load(tuning_metadata_file)
+        print('load from json file, suite argument will be ignored')
+    elif arguments.suite:
+        # set up problems
+        generator = perflib.generators.SuiteProblemGenerator(arguments.suite)
+        launcher = create_launcher(generator)
+    else:
+        print(
+            "No input data, use -i /path/to/jsonfile or -s suite_name for input"
+        )
+        return
+
+    # check flag setting, could be from --input file or from argument list
+    if 'dump_candidates' not in launcher:
+        launcher['dump_candidates'] = arguments.dump
+    if 'tune_exact_prob' not in launcher:
+        launcher['tune_exact_prob'] = arguments.exact
+    if 'print_reject_reason' not in launcher:
+        launcher['print_reject_reason'] = arguments.print_reject
+    if 'overwrite_max_wgs' not in launcher and arguments.max_wgs is not None:
+        launcher['overwrite_max_wgs'] = arguments.max_wgs
+    if 'overwrite_min_wgs' not in launcher and arguments.min_wgs is not None:
+        launcher['overwrite_min_wgs'] = arguments.min_wgs
+
+    # remind users of the effective global value (it may come from the --input file)
+    if 'overwrite_max_wgs' in launcher:
+        print("max-workgroup-sizes is globally overwritten to {}".format(
+            launcher['overwrite_max_wgs']))
+    if 'overwrite_min_wgs' in launcher:
+        print("min-workgroup-sizes is globally overwritten to {}".format(
+            launcher['overwrite_min_wgs']))
+
+    # dump the tuning configuration to a file, next time we can directly use it
+    with open(workspace / 'tuning_meta_data.json', 'w') as tuning_metadata_file:
+        json.dump(launcher, tuning_metadata_file)
+
+    # the launcher dict MUST contain the following 3 keys (since we have default value in args)
+    # set env variable: dump
+    if launcher['dump_candidates'] == True:
+        os.environ['DUMP_TUNING'] = '1'
+    elif 'DUMP_TUNING' in os.environ:
+        del os.environ['DUMP_TUNING']
+
+    # set env variable: match exact token
+    if launcher['tune_exact_prob'] == True:
+        os.environ['TUNE_EXACT_PROB'] = '1'
+    elif 'TUNE_EXACT_PROB' in os.environ:
+        del os.environ['TUNE_EXACT_PROB']
+
+    # set env variable: print rejection reason
+    if launcher['print_reject_reason'] == True:
+        os.environ['PRINT_REJECT_REASON'] = '1'
+    elif 'PRINT_REJECT_REASON' in os.environ:
+        del os.environ['PRINT_REJECT_REASON']
+
+    # log layer
+    os.environ['ROCFFT_LAYER'] = '64'
+
+    # enable cache file to speed up tuning
+    os.environ['ROCFFT_RTC_CACHE_PATH'] = 'rocFFT_kernel_cache.db'
+
+    tuning_summaries = []
+    single_summary = {}
+    merging_metadata = []
+    single_merging_meta = {}
+
+    for prob in launcher['problems']:
+        # overwriting min_ and max_wgs is optional; when not specified, we use the setting in each problem
+        os.environ['MAX_WGS'] = str(
+            launcher['overwrite_max_wgs']
+        ) if 'overwrite_max_wgs' in launcher else str(prob['max_wgs'])
+        os.environ['MIN_WGS'] = str(
+            launcher['overwrite_min_wgs']
+        ) if 'overwrite_min_wgs' in launcher else str(prob['min_wgs'])
+
+        token, outfile, summary, success = perflib.tuner.run(
+            arguments.tuner,
+            prob['length'],
+            direction=prob['direction'],
+            real=prob['real'],
+            inplace=prob['inplace'],
+            precision=prob['precision'],
+            nbatch=prob['nbatch'],
+            ntrial=10)
+        single_summary['problem'] = prob
+        single_summary['token'] = token
+        single_summary['outfile'] = outfile
+        single_summary['result'] = summary
+        single_summary['valid'] = success
+        tuning_summaries.append(deepcopy(single_summary))
+        if success:
+            single_merging_meta['problem'] = prob
+            single_merging_meta['outfile'] = outfile
+            single_merging_meta['token'] = token
+            merging_metadata.append(deepcopy(single_merging_meta))
+
+    del os.environ['ROCFFT_RTC_CACHE_PATH']
+    os.environ.pop('MIN_WGS', None)  # only set inside the loop; absent if no problems ran
+    os.environ.pop('MAX_WGS', None)
+
+    # print all summary
+    print('==================\n[Tuning Summaries]\n==================\n')
+    for summary in tuning_summaries:
+        print('===================================================')
+        print('[Token]: ' + summary['token'])
+        print(summary['result'])
+        if (summary['valid'] == False):
+            print('Tuning Failed.')
+        else:
+            print('[Export to File]: ' + summary['outfile'])
+
+    # write a summary file saving problems and output file and valid status,
+    # this file will be used in merging
+    with open(workspace / 'merging_meta_data.json', 'w') as merging_metadata_file:
+        json.dump(merging_metadata, merging_metadata_file)
+
+
+def validate(validator, testing_meta_data):
+    """Accuracy-test each entry's solution map; store the outcome in entry['Valid']."""
+    for single_meta in testing_meta_data:
+        prob = single_meta['problem']
+        new_single_solution_file = Path(single_meta['outfile'])
+        print("\nAccuracy test for the new solution {}...".format(
+            new_single_solution_file))
+
+        # a missing solution map is recorded as invalid so 'Valid' is always set
+        if not new_single_solution_file.exists():
+            print("\tNot valid solution map file found! Abort")
+            single_meta['Valid'] = False
+            continue
+
+        # set the explicit solution map filepath for running test
+        os.environ['ROCFFT_READ_EXPLICIT_SOL_MAP_FILE'] = str(
+            new_single_solution_file)
+
+        # use token with batch raised to at least 10: this is validation, not perf
+        token = str(single_meta['token'])
+        if prob['nbatch'] < 10:
+            old_batch_str = '_batch_' + str(prob['nbatch']) + '_'
+            new_batch_str = '_batch_10_'
+            token = token.replace(old_batch_str, new_batch_str)
+
+        # run rocfft-test
+        single_meta['Valid'] = perflib.tuner.accuracy_test(
+            validator,
+            prob['length'],
+            direction=prob['direction'],
+            real=prob['real'],
+            inplace=prob['inplace'],
+            precision=str(prob['precision']),
+            nbatch=max(prob['nbatch'], 10),
+            token=token)
+
+        del os.environ['ROCFFT_READ_EXPLICIT_SOL_MAP_FILE']
+
+
+def output_merge_file(solution_merger, old_sol_filepath, insertions,
+                      output_filepath):
+    """Insert each new solution into the solution map, one merge call at a time.
+
+    The first merge starts from old_sol_filepath (or from no base file when
+    that path does not exist); every later merge starts from output_filepath.
+    """
+    print("Step1: \n\tReading old solution map from: {}".format(
+        old_sol_filepath))
+    base_path = old_sol_filepath if old_sol_filepath.exists() else None
+
+    print("Step2:")
+    for index, entry in enumerate(insertions):
+        # after the first insertion, build on the file written so far
+        if index != 0:
+            base_path = output_filepath
+        key = ':'.join((str(entry['arch']), str(entry['probToken'])))
+        print("\t\tInserting new solutions of {}".format(key))
+        merged = perflib.tuner.merge(solution_merger,
+                                     base_file_path=base_path,
+                                     new_files=entry['solutionFile'],
+                                     new_probTokens=key,
+                                     out_file_path=output_filepath)
+        if merged:
+            print("\t\t\tDone!")
+        else:
+            print("\t\t\tFailed!")
+    print("Step3:\n\tFinishing writing merge solution map to: {}".format(
+        output_filepath))
+
+
+def command_merging(arguments):
+    """Merge solutions."""
+
+    prefix = get_local_gpu_gfx()
+    sol_map_filename = Path(prefix + '_rocfft_solution_map.dat')
+
+    if arguments.workspace:
+        workspace = Path(arguments.workspace)
+    else:
+        print(
+            "Workspace not set. use -w /path/of/workspace before command arg")
+        return
+
+    if arguments.metafile:
+        merging_meta_file = open(arguments.metafile, 'r')
+        merging_meta_data = json.load(merging_meta_file)
+    else:
+        print("No input data, use --metafile=/path/of/merging-meta-file")
+        return
+
+    if arguments.outfolder:
+        outfolder = workspace / Path(arguments.outfolder)
+        outfolder.mkdir(parents=True, exist_ok=True)
+        output_file = outfolder / sol_map_filename
+    else:
+        print(
+            "output folder is not set. use -outfolder=/subfolder/of/outputfolder"
+        )
+        return
+
+    reference_sol_folder = Path(arguments.ref_sol_folder)
+    reference_file = reference_sol_folder / sol_map_filename
+
+    # check if everything is well...
+    print("\nThe local AMDGPU is {}".format(prefix))
+
+    os.environ['ROCFFT_LAYER'] = '64'
+
+    # we disable the build-in solution library,
+    os.environ['ROCFFT_USE_EMPTY_SOL_MAP'] = '1'
+
+    # Do validation first to make it more robust:
+    #   Our kernels might still have un-discovered bugs so we need to do that.
+    #   Finding incorrect kernels also helps us improving kernels.
+    print("\n###############################################")
+    print("First, run validation to skip wrong solutions..")
+    print("###############################################")
+    validate(arguments.validator, merging_meta_data)
+
+    RunRecord = {}
+
+    # First Run - reference
+    os.environ['ROCFFT_READ_SOL_MAP_FROM_FOLDER'] = str(reference_sol_folder)
+
+    print("\n###################################")
+    print("First run with reference solution..")
+    print("###################################\n")
+    print(
+        "The reference solution map file is {}...\n\n".format(reference_file))
+    if not reference_file.exists():
+        print("\tNot valid reference solution map file found! " \
+            "Will compare with the default plan/kernels in rocFFT\n")
+
+    for single_meta in merging_meta_data:
+        if not single_meta['Valid']:
+            continue
+
+        prob = single_meta['problem']
+        token, times, success, solToken, matchType = perflib.rider.run(
+            arguments.rider,
+            prob['length'],
+            direction=prob['direction'],
+            real=prob['real'],
+            inplace=prob['inplace'],
+            precision=str(prob['precision']),
+            nbatch=prob['nbatch'],
+            ntrial=20,
+            verbose=True)
+
+        times_str = ' '.join(str(t) for t in times)
+        print("\ntoken: " + token)
+        print("found root solution with key: " + solToken)
+        print("times: " + times_str + '\n')
+        print('---------------------------------------------------\n')
+        times = flatten(times)
+
+        RunRecord[token] = {
+            'RefTimes': times,
+            'RefSolToken': solToken,
+            'RefExactToken': (matchType == "FULL")
+        }
+
+    # Second Run - load new solution map from "ROCFFT_READ_EXPLICIT_SOL_MAP_FILE"
+    print("\n###################################")
+    print("Second run with new solution map...")
+    print("###################################\n")
+    for single_meta in merging_meta_data:
+        if not single_meta['Valid']:
+            continue
+
+        # get the <problem, out_solution_path> information
+        prob = single_meta['problem']
+        new_single_solution_file = Path(single_meta['outfile'])
+        # check the solution map file
+        print("The testing solution map file is {}...\n".format(
+            new_single_solution_file))
+        if not new_single_solution_file.exists():
+            print("\tNot valid solution map file found! Abort")
+            continue
+
+        # set the explicit solution map filepath
+        os.environ['ROCFFT_READ_EXPLICIT_SOL_MAP_FILE'] = str(
+            new_single_solution_file)
+        token, times, success, solToken, matchType = perflib.rider.run(
+            arguments.rider,
+            prob['length'],
+            direction=prob['direction'],
+            real=prob['real'],
+            inplace=prob['inplace'],
+            precision=str(prob['precision']),
+            nbatch=prob['nbatch'],
+            ntrial=20,
+            verbose=True)
+
+        times_str = ' '.join(str(t) for t in times)
+        print("\ntoken: " + token)
+        print("found root solution with key: " + solToken)
+        print("times: " + times_str + '\n')
+        print('---------------------------------------------------\n')
+        times = flatten(times)
+
+        RunRecord[token].update({
+            'NewTimes': times,
+            'NewSolToken': solToken,
+            'SolutionFile': new_single_solution_file,
+            'NewExactToken': (matchType == "FULL")
+        })
+
+    import scipy.stats
+
+    for token, result in RunRecord.items():
+        Avals = result['RefTimes']
+        Bvals = result['NewTimes']
+        result['RefTimes'] = statistics.median(Avals)
+        result['NewTimes'] = statistics.median(Bvals)
+        speedup = result['RefTimes'] / result['NewTimes']
+        _, pval, _, _ = scipy.stats.median_test(Avals, Bvals)
+        result['SpeedUp'] = speedup
+        result['Significance'] = pval
+
+    del os.environ['ROCFFT_USE_EMPTY_SOL_MAP']
+    del os.environ['ROCFFT_READ_SOL_MAP_FROM_FOLDER']
+    if 'ROCFFT_READ_EXPLICIT_SOL_MAP_FILE' in os.environ:
+        del os.environ['ROCFFT_READ_EXPLICIT_SOL_MAP_FILE']
+
+    NewSolutionsToInsert = []
+    InsertionInfo = {}
+
+    # Analysis Summary
+    print('\n==================\n[Merge Summaries]\n==================')
+    for single_meta in merging_meta_data:
+        if not single_meta['Valid']:
+            print('\n===================================================')
+            print("The solution in {}:".format(single_meta['outfile']))
+            print("\tFailed the accuracy test. Please be aware of this")
+            continue
+
+    for token, result in RunRecord.items():
+        # the problem is not contained in our new solution map, don't waste time comparing
+        # but it's possible RefSolToken is "" which means we didn't have the solution before
+        if result['NewSolToken'] == "":
+            continue
+
+        speed_up = result['SpeedUp']
+        signficance = result['Significance']
+        confident: bool = signficance <= 0.05
+        new_is_faster: bool = speed_up > 1.0 and confident
+        ref_is_min_matching: bool = result['RefExactToken'] is False
+        new_is_min_matching: bool = result['NewExactToken'] is False
+        can_keep_slow_ref_sol: bool = new_is_faster and ref_is_min_matching and not new_is_min_matching
+        keep_ref_solution: bool = (not new_is_faster) or can_keep_slow_ref_sol
+
+        if new_is_faster:
+            InsertionInfo['arch'] = prefix
+            InsertionInfo['probToken'] = result['NewSolToken']
+            InsertionInfo['solutionFile'] = result['SolutionFile']
+            NewSolutionsToInsert.append(deepcopy(InsertionInfo))
+
+        print('\n===================================================')
+        print("For problem {}:\n".format(token))
+        print("\tReference solution: {}".format(result['RefSolToken']))
+        print("\tTime (Median): {} ms ...".format(result['RefTimes']))
+        print("\t\t--> Token-matching type is {}-match".format(
+            'minimal' if ref_is_min_matching else 'exact'))
+        print("\t\t--> {} the original solution\n".format(
+            'KEEP' if keep_ref_solution else 'DISCARD'))
+
+        print("\tNew solution: {} from file: {}".format(
+            result['NewSolToken'], result['SolutionFile']))
+        print("\tTime (Median): {} ms ...".format(result['NewTimes']))
+        print("\tSpeed-Up run2 over run1: {} / Pval: {} : {}, {}".format(
+            speed_up, signficance, 'FASTER' if speed_up > 1.0 else 'SLOWER',
+            'Confident' if confident else 'Not Confident'))
+        print("\t\t--> Token-matching type is {}-match".format(
+            'minimal' if new_is_min_matching else 'exact'))
+        print("\t\t--> {} the new solution\n".format(
+            'PICK' if new_is_faster else 'DISCARD'))
+
+    output_merge_file(arguments.tuner, reference_file, NewSolutionsToInsert,
+                      output_file)
+
+
+#
+# Main
+#
+def main():
+    """Parse the command line and dispatch to the 'tune' or 'merge' sub-command, then exit with status 0."""
+    parser = argparse.ArgumentParser(prog='rocfft-tuner')
+    subparsers = parser.add_subparsers(dest='command')
+    tuning_parser = subparsers.add_parser('tune', help='tune problems')
+    merge_parser = subparsers.add_parser('merge', help='merge solutions')
+
+    #################
+    # Shared Arguments
+    #################
+    # added to the top-level parser, so with argparse they go before the sub-command name
+    parser.add_argument('-w',
+                        '--workspace',
+                        type=str,
+                        help='workspace folder keeping all the tuning data',
+                        default="./TUNING_WORKSPACE")
+
+    parser.add_argument(
+        '--tuner',
+        type=str,
+        help='tuner executable path, used as tuner and merger',
+        default='./build/release/library/src/rocfft_offline_tuner')
+
+    parser.add_argument('--rider',
+                        type=str,
+                        help='rocfft-rider executable path, used when merging',
+                        default='./build/release/clients/staging/rocfft-rider')
+
+    parser.add_argument('--validator',
+                        type=str,
+                        help='rocfft-test executable path, used when merging',
+                        default='./build/release/clients/staging/rocfft-test')
+
+    #################
+    # TUNING COMMAND
+    #################
+    tuning_parser.add_argument('-s',
+                               '--suite',
+                               type=str,
+                               help='suite',
+                               action='append')
+
+    tuning_parser.add_argument(
+        '--dump',
+        help='dump the candidates data file in the workspace, default False',
+        action='store_true',
+        default=False)
+
+    tuning_parser.add_argument(
+        '--exact',
+        help=
+        'output solution for the exact problem (including batch, stride, dist), default False',
+        action='store_true',
+        default=False)
+
+    tuning_parser.add_argument(
+        '--print_reject',
+        help='print the configuration rejection reason, default False',
+        action='store_true',
+        default=False)
+
+    tuning_parser.add_argument(
+        '--max_wgs',
+        type=int,
+        help=
+        'Overwrite tuning max workgroups size to the specified value for ALL kernels.',
+        default=None)
+
+    tuning_parser.add_argument(
+        '--min_wgs',
+        type=int,
+        help=
+        'Overwrite tuning min workgroups size to the specified value for ALL kernels.',
+        default=None)
+
+    tuning_parser.add_argument(
+        '-i',
+        '--input',
+        type=str,
+        help='input of tuning meta file including suite, dump, exact setting',
+        default='')
+
+    #################
+    # MERGE COMMAND
+    #################
+    merge_parser.add_argument(
+        '--ref_sol_folder',
+        type=str,
+        help=
+        'folder of the original solution map data, default is [repo_folder]/solution_map/',
+        default='./solution_map')
+
+    merge_parser.add_argument(
+        '--outfolder',
+        type=str,
+        help=
+        'folder of the merged solution map data (under workspace), must specify',
+        default=None)
+
+    merge_parser.add_argument(
+        '--metafile',
+        type=str,
+        help='path of the merging meta file, must specify',
+        default=None)
+
+    arguments = parser.parse_args()
+
+    if arguments.command == 'tune':
+        command_tuning(arguments)
+
+    if arguments.command == 'merge':
+        command_merging(arguments)
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(filename='rocfft-tuner.log',
+                        format='%(asctime)s %(levelname)s: %(message)s',
+                        level=logging.DEBUG)
+    # DEBUG and above go to the log file; the `console` handler (defined outside this block) only shows WARNING and above
+    console.setLevel(logging.WARNING)
+    console.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s'))
+    logging.getLogger('').addHandler(console)
+
+    main()
diff -Nru rocfft-5.5.0/scripts/perf/suites.py rocfft-5.7.1/scripts/perf/suites.py
--- rocfft-5.5.0/scripts/perf/suites.py	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/suites.py	2023-08-09 16:19:51.000000000 +0000
@@ -25,10 +25,14 @@
 
 import numpy as np
 
+import perflib.specs
+
 all_precisions = ['single', 'double']
 all_directions = [-1, 1]
 all_inplaces = [True, False]
 all_reals = [True, False]
+def_tuning_min_wgs = 64
+def_tuning_max_wgs = 512
 
 # yapf: disable
 lengths = {
@@ -189,9 +193,9 @@
     ],
 
     'qa2d10b': [
+        (3125, 3125),
         (4096, 4096),
         (6561, 6561),
-        (3125, 3125),
     ],
 
     'qa3d10b': [
@@ -343,12 +347,25 @@
 
 # yield problem sizes with default precision, direction, etc
 def default_length_params(tag, lengths, nbatch, precisions=all_precisions, \
-    directions=all_directions, inplaces=all_inplaces, reals=all_reals):
+    directions=all_directions, inplaces=all_inplaces, reals=all_reals, min_wgs=def_tuning_min_wgs, max_wgs=def_tuning_max_wgs):
+
+    # workaround: disable failing token on gfx906
+    if perflib.specs.get_machine_specs(0).gpuid == '0x66a1':
+        gfx906 = True
+    else:
+        gfx906 = False
 
     for precision, direction, inplace, real in product(precisions, directions,
                                                        inplaces, reals):
         for length in lengths:
             length = (length, ) if isinstance(length, int) else length
+
+            # workaround: disable failing token on gfx906 (length may be a tuple, so compare it as a list)
+            if gfx906 and (list(length) == [32768, 32768] and nbatch == 1
+                           and direction == -1 and not inplace and real
+                           and precision == 'single'):
+                continue
+
             yield Problem(length,
                           tag=mktag(tag, len(length), precision, direction,
                                     inplace, real),
@@ -356,7 +373,9 @@
                           direction=direction,
                           inplace=inplace,
                           real=real,
-                          precision=precision)
+                          precision=precision,
+                          min_wgs=min_wgs,
+                          max_wgs=max_wgs)
 
 
 def md():
@@ -369,7 +388,7 @@
     """AMD QA suite."""
 
     for length1 in [
-            8192, 10752, 18816, 21504, 32256, 43008, 16384, 19683, 15625, 16807
+            8192, 10752, 15625, 16384, 16807, 18816, 19683, 21504, 32256, 43008
     ]:
         for direction in [-1, 1]:
             yield Problem([length1],
@@ -389,13 +408,12 @@
                   real=False,
                   precision='double')
 
-    yield Problem((336, 336, 56),
-                  tag=mktag('qa3', 3, 'double', -1, False, False),
-                  nbatch=1,
-                  direction=-1,
-                  inplace=False,
-                  real=False,
-                  precision='double')
+    yield from default_length_params("336x336x56", [(336, 336, 56)],
+                                     1,
+                                     directions=[-1],
+                                     precisions=['double'],
+                                     inplaces=[True, False],
+                                     reals=[False])
 
     for length3 in lengths['md']:
         for direction in [-1, 1]:
@@ -406,7 +424,8 @@
                           direction=direction,
                           inplace=False,
                           real=True,
-                          precision='single')
+                          precision='single',
+                          meta={'figtype': 'bargraph'})
 
     for length in lengths['qa1d10b']:
         yield Problem([length],
@@ -630,6 +649,27 @@
                                      reals=[True])
 
 
+def batch_const_count():
+    """Yield 1D complex problems with length 2^lexp and batch 2^(exp - lexp), so batch * length = 2^25 ... 2^30."""
+    for direction in [-1, 1]:
+        for precision in all_precisions:
+            for exp in [25, 26, 27, 28, 29, 30]:
+                for place in all_inplaces:
+                    for lexp in range(4, exp + 1):
+                        length = 2**lexp
+                        batch = 2**(exp - lexp)
+
+                        yield Problem([length],
+                                      tag=mktag("footprint2exp" + str(exp), 1,
+                                                precision, direction, False,  # NOTE(review): tag inplace is fixed to False while inplace=place below - confirm intended
+                                                False),
+                                      nbatch=batch,
+                                      direction=direction,
+                                      inplace=place,
+                                      real=False,
+                                      precision=precision)
+
+
 def benchmarks():
     """Benchmarks: XXX"""
 
@@ -676,3 +716,114 @@
                                                     (4294967296)],
                                      1,
                                      reals=[False])
+
+
+def tuning_example():
+    """Yield three example tuning problems: length 81 in 1D, 2D and 3D (double precision, direction -1, out-of-place, complex)."""
+
+    yield from default_length_params("81_1d", [(81)],
+                                     60000,
+                                     directions=[-1],
+                                     precisions=['double'],
+                                     inplaces=[False],
+                                     reals=[False],
+                                     min_wgs=128,
+                                     max_wgs=256)
+
+    yield from default_length_params("81_2d", [(81, 81)],
+                                     8000,
+                                     directions=[-1],
+                                     precisions=['double'],
+                                     inplaces=[False],
+                                     reals=[False],
+                                     min_wgs=128,
+                                     max_wgs=256)
+
+    # batch=500 to enable tuning with intrinsic buffer
+    yield from default_length_params("81_3d", [(81, 81, 81)],
+                                     500,
+                                     directions=[-1],
+                                     precisions=['double'],
+                                     inplaces=[False],
+                                     reals=[False],
+                                     min_wgs=128,
+                                     max_wgs=256)
+
+
+def tuning_suite():
+    """Yield the tuning problem set; min_wgs/max_wgs bound the workgroup sizes searched for each problem."""
+
+    # As a rule, single precision is tuned with a wgs range of 128~512 and double with 128~256,
+    # but the range can be changed for a particular problem.
+    # Inside the cpp tuner implementation, min_wgs might still be changed automatically
+    # if the setting yields no candidate (for example, a len64 with min_wgs=128 might not derive any).
+
+    # complex transforms from suite qa.
+    for length1 in [
+            8192, 10000, 10752, 15625, 16384, 16807, 18816, 19683, 21504
+    ]:
+        for direction in [1]:
+            yield Problem([length1],
+                          tag=mktag("qa1", 1, 'double', direction, False,
+                                    False),
+                          nbatch=10000,
+                          direction=direction,
+                          inplace=False,
+                          real=False,
+                          precision='double',
+                          min_wgs=128,
+                          max_wgs=256)
+
+    # batch=5000 to enable tuning with intrinsic buffer,
+    # since batch 10000 causes memory offset > 2^32 and buffer inst will be disabled
+    for length1 in [32256, 43008]:
+        for direction in [1]:
+            yield Problem([length1],
+                          tag=mktag("qa1", 1, 'double', direction, False,
+                                    False),
+                          nbatch=5000,
+                          direction=direction,
+                          inplace=False,
+                          real=False,
+                          precision='double',
+                          min_wgs=128,
+                          max_wgs=256)
+
+    # search more widely for this problem: min_wgs is left at the default_length_params default rather than 128
+    yield from default_length_params("336x336x56", [(336, 336, 56)],
+                                     1,
+                                     directions=[-1],
+                                     precisions=['double'],
+                                     inplaces=[True, False],
+                                     reals=[False],
+                                     max_wgs=256)
+
+    for length in lengths['qa1d10b']:
+        yield Problem([length],
+                      tag=mktag("qa1d10b", 1, 'single', -1, True, False),
+                      nbatch=10,
+                      direction=-1,
+                      inplace=True,
+                      real=False,
+                      precision='single',
+                      min_wgs=128)
+
+    for length2 in lengths['qa2d10b']:
+        yield Problem(length2,
+                      tag=mktag("qa2d10b", 2, 'single', -1, True, False),
+                      nbatch=10,
+                      direction=-1,
+                      inplace=True,
+                      real=False,
+                      precision='single',
+                      min_wgs=128)
+
+    for length3 in lengths['qa3d10b']:
+        yield Problem(length3,
+                      tag=mktag("qa3d10b", 3, 'single', -1, True, False),
+                      nbatch=10,
+                      direction=-1,
+                      inplace=True,
+                      real=False,
+                      precision='single',
+                      min_wgs=128)
diff -Nru rocfft-5.5.0/scripts/perf/utils.asy rocfft-5.7.1/scripts/perf/utils.asy
--- rocfft-5.5.0/scripts/perf/utils.asy	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/scripts/perf/utils.asy	2023-08-09 16:19:51.000000000 +0000
@@ -99,7 +99,7 @@
     for(int n = 0; n < filelist.length; ++n)
     {
         string filename = filelist[n];
-        write("filename: ", filename);
+        //write("filename: ", filename);
         file fin = input(filename).line();
         
         string hdr = "";
@@ -118,7 +118,7 @@
           // Separate the token from the data:
 	  int pos = find(line, '\t', 0);
 	  string token = substr(line, 0, pos);
-	  write("token: ", token);
+	  //write("token: ", token);
 	  string vals = substr(line, pos + 1, -1);
 	  //write("vals: ", vals);
 
@@ -150,25 +150,25 @@
               ++lenidx;
           }
 
-	  write("length: ", length);
+	  //write("length: ", length);
 
           // Get the data:
 	  lastpos = 0;
 	  pos = find(vals, '\t', lastpos);
 	  string smedian = substr(vals, lastpos, pos - lastpos);
-	  write("median: ", smedian);
+	  //write("median: ", smedian);
 	  lastpos = pos > 0 ? pos + 1 : -1;
 
           string slow, shigh;
           
           pos = find(vals, '\t', lastpos);
           slow = substr(vals, lastpos, pos - lastpos);
-          write("median low: ", slow);
+          //write("median low: ", slow);
           lastpos = pos > 0 ? pos + 1 : -1;
           
           pos = find(vals, '\t', lastpos);
           shigh = substr(vals, lastpos, pos - lastpos);
-          write("median high: ", shigh);
+          //write("median high: ", shigh);
           lastpos = pos > 0 ? pos + 1 : -1;
 
                     
@@ -176,7 +176,7 @@
 
           pos = find(vals, '\t', lastpos);
           string spval = substr(vals, lastpos, pos - lastpos);
-          write("pval: ", spval);
+          //write("pval: ", spval);
           lastpos = pos > 0 ? pos + 1 : -1;
           
           d.x = length[0];
diff -Nru rocfft-5.5.0/scripts/solmap-version-convert.py rocfft-5.7.1/scripts/solmap-version-convert.py
--- rocfft-5.5.0/scripts/solmap-version-convert.py	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/scripts/solmap-version-convert.py	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import argparse
+import logging
+import subprocess
+import sys
+import os
+import pathlib
+
+from os import listdir
+from os.path import isfile, join
+from pathlib import Path
+
+console = logging.StreamHandler()
+
+
+# #
+# # Commands
+# #
+def version_check(arguments):
+    """Check the format version of each solution map and convert it.
+
+    Runs arguments.converter once per file in arguments.infolder whose name
+    contains '_rocfft_solution_map.dat', writing to the same file name under
+    arguments.outfolder (created if missing).  A converter failure is
+    reported on stdout and does not stop the remaining files.
+    """
+
+    input_folder = Path(arguments.infolder) if arguments.infolder else None
+    if (input_folder is None) or (not input_folder.exists()):
+        print(
+            "input folder is not set or not existing. use --infolder=/inputfolder/of/solutionmaps"
+        )
+        return
+
+    if arguments.outfolder:
+        output_folder = Path(arguments.outfolder)
+        output_folder.mkdir(parents=True, exist_ok=True)
+    else:
+        print(
+            "output folder is not set. use --outfolder=/subfolder/of/outputfolder"
+        )
+        return
+
+    # pick files that are solution maps
+    map_filenames = [
+        f for f in listdir(input_folder)
+        if isfile(join(input_folder, f)) and '_rocfft_solution_map.dat' in f
+    ]
+
+    # we'll do this in the cpp
+    # os.environ['ROCFFT_USE_EMPTY_SOL_MAP'] = '1'
+
+    for filename in map_filenames:
+        print("checking file format version: " + str(filename))
+        cmd = [pathlib.Path(arguments.converter).resolve()]
+        cmd += ['--input_file', str(input_folder / filename)]
+        cmd += ['--output_file', str(output_folder / filename)]
+        cmd = [str(x) for x in cmd]
+        logging.info('checking file format version: ' + ' '.join(cmd))
+
+        # communicate() drains stdout while waiting for exit; wait() alone
+        # can deadlock once the child fills the OS pipe buffer.
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+        output, _ = proc.communicate()
+        if proc.returncode != 0:
+            print('Failed on checking version: ' + ' '.join(cmd))
+
+        # echo the converter output, each line followed by an empty line
+        for raw_line in output.splitlines():
+            print(raw_line.decode('utf-8') + '\n')
+
+
+#
+# Main
+#
+def main():
+    """Parse --converter, --infolder and --outfolder, run version_check(), then exit with status 0."""
+    parser = argparse.ArgumentParser(prog='solmap-version-convert')
+    parser.add_argument(
+        '--converter',
+        type=str,
+        help='converter executable path',
+        default='./build/release/library/src/rocfft_solmap_convert')
+
+    parser.add_argument(
+        '--infolder',
+        type=str,
+        help=
+        'folder of the original solution map data, default is [repo_folder]/solution_map/',
+        default='./solution_map')
+
+    parser.add_argument(
+        '--outfolder',
+        type=str,
+        help=
+        'folder of the version-coverted solution map data, default is [repo_folder]/solution_map/converted/',
+        default='./solution_map/converted')
+
+    arguments = parser.parse_args()
+
+    version_check(arguments)
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(filename='solmap-version-convert.log',
+                        format='%(asctime)s %(levelname)s: %(message)s',
+                        level=logging.DEBUG)
+    # DEBUG and above go to the log file; the module-level `console` stream handler only shows WARNING and above
+    console.setLevel(logging.WARNING)
+    console.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s'))
+    logging.getLogger('').addHandler(console)
+
+    main()
diff -Nru rocfft-5.5.0/shared/array_validator.cpp rocfft-5.7.1/shared/array_validator.cpp
--- rocfft-5.5.0/shared/array_validator.cpp	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/shared/array_validator.cpp	2023-08-09 16:19:51.000000000 +0000
@@ -41,7 +41,7 @@
                                   const std::vector<size_t> s,
                                   const int                 verbose)
 {
-    size_t              l0, s0;
+    size_t              l0{0}, s0{0};
     std::vector<size_t> l1{}, s1{};
     for(unsigned int i = 0; i < l.size(); ++i)
     {
@@ -180,12 +180,16 @@
 
     // If the 2D faces are valid, check an axis vs a face for collisions:
     bool invalid = false;
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(int idx = 0; idx < 3; ++idx)
     {
         if(!valid_length_stride_1d_multi(idx, l, s, verbose))
         {
+#ifdef _OPENMP
 #pragma omp cancel for
+#endif
             invalid = true;
         }
     }
@@ -215,12 +219,16 @@
 
     bool invalid = false;
     // Check that 1D vs 3D faces are valid:
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(int idx0 = 0; idx0 < 4; ++idx0)
     {
         if(!valid_length_stride_1d_multi(idx0, l, s, verbose))
         {
+#ifdef _OPENMP
 #pragma omp cancel for
+#endif
             invalid = true;
         }
     }
@@ -249,7 +257,9 @@
     } while(std::next_permutation(v.begin(), v.end()));
 
     // Then loop over all of the permutations.
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(size_t iperm = 0; iperm < perms.size(); ++iperm)
     {
         std::vector<size_t> l0(2);
@@ -300,7 +310,9 @@
 
         if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
         {
+#ifdef _OPENMP
 #pragma omp cancel for
+#endif
             invalid = true;
         }
     }
@@ -375,7 +387,9 @@
 
         bool invalid = false;
         // Then loop over all of the permutations.
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(size_t iperm = 0; iperm < perms.size(); ++iperm)
         {
             std::vector<size_t> l0(dim0);
@@ -427,7 +441,9 @@
 
             if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
             {
+#ifdef _OPENMP
 #pragma omp cancel for
+#endif
                 invalid = true;
             }
         }
diff -Nru rocfft-5.5.0/shared/data_gen.h rocfft-5.7.1/shared/data_gen.h
--- rocfft-5.5.0/shared/data_gen.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/data_gen.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,1070 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef DATA_GEN_H
+#define DATA_GEN_H
+
+// rocRAND can generate warnings if inline asm is not available for
+// some architectures.  data generation isn't performance-critical,
+// so just disable inline asm to prevent the warnings.
+#define ROCRAND_DISABLE_INLINE_ASM
+
+#include "../shared/arithmetic.h"
+#include "../shared/gpubuf.h"
+#include "../shared/rocfft_complex.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hiprand/hiprand.h>
+#include <hiprand/hiprand_kernel.h>
+#include <limits>
+#include <vector>
+
+static const unsigned int DATA_GEN_THREADS    = 32;
+static const unsigned int DATA_GEN_GRID_Y_MAX = 64;
+
+template <typename T>
+struct input_val_1D // one T per dimension (1D); passed below as a length, a stride or an index
+{
+    T val1;
+};
+
+template <typename T>
+struct input_val_2D // 2D counterpart: val1 is the x component, val2 the y component
+{
+    T val1;
+    T val2;
+};
+
+template <typename T>
+struct input_val_3D // 3D counterpart: val1, val2, val3 are the x, y, z components
+{
+    T val1;
+    T val2;
+    T val3;
+};
+
+template <typename T> // scalar -> input_val_1D
+static input_val_1D<T> get_input_val(const T& val)
+{
+    return input_val_1D<T>{val};
+}
+
+template <typename T> // 2-tuple -> input_val_2D (element 0 becomes val1, element 1 becomes val2)
+static input_val_2D<T> get_input_val(const std::tuple<T, T>& val)
+{
+    return input_val_2D<T>{std::get<0>(val), std::get<1>(val)};
+}
+
+template <typename T> // 3-tuple -> input_val_3D (elements 0, 1, 2 become val1, val2, val3)
+static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val)
+{
+    return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)};
+}
+
+template <typename T> // 1D offset: length.val1 * stride.val1 + base ("length" is a per-dimension index in the kernels)
+__device__ static size_t
+    compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base)
+{
+    return (length.val1 * stride.val1) + base;
+}
+
+template <typename T> // 2D offset: sum over both dimensions of length.valN * stride.valN, plus base
+__device__ static size_t
+    compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base)
+{
+    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base;
+}
+
+template <typename T> // 3D offset: sum over all three dimensions of length.valN * stride.valN, plus base
+__device__ static size_t
+    compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base)
+{
+    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3)
+           + base;
+}
+
+template <typename T> // all-zero 1D value; whole_length is not read, it only selects this overload
+static inline input_val_1D<T> make_zero_length(const input_val_1D<T>& whole_length)
+{
+    return input_val_1D<T>{0};
+}
+
+template <typename T> // all-zero 2D value; whole_length is not read, it only selects this overload
+static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length)
+{
+    return input_val_2D<T>{0, 0};
+}
+
+template <typename T> // all-zero 3D value; whole_length is not read, it only selects this overload
+static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length)
+{
+    return input_val_3D<T>{0, 0, 0};
+}
+
+template <typename T>
+__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length)
+{
+    // Position of linear index i inside its batch: strip whole rows of
+    // whole_length.val1 elements and keep the remainder.
+    auto row_extent = whole_length.val1;
+    auto x_pos      = i % row_extent;
+    return input_val_1D<T>{x_pos};
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length)
+{
+    // Batch number of linear index i: the count of whole rows of
+    // whole_length.val1 elements that fit below i.
+    auto row_extent = whole_length.val1;
+    auto batch_no   = i / row_extent;
+    return batch_no;
+}
+
+template <typename T>
+__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length)
+{
+    // Decompose linear index i with val1 as the fastest-varying extent.
+    auto extent_x = whole_length.val1;
+    auto extent_y = whole_length.val2;
+    auto rows     = i / extent_x; // whole x-rows below i
+    auto pos_x    = i % extent_x;
+    auto pos_y    = rows % extent_y;
+    return input_val_2D<T>{pos_x, pos_y};
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length)
+{
+    // Batch number: whole x-rows below i, divided by the rows per batch.
+    auto extent_x = whole_length.val1;
+    auto extent_y = whole_length.val2;
+    auto rows     = i / extent_x;
+    auto batch_no = rows / extent_y;
+    return batch_no;
+}
+
+template <typename T>
+__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length)
+{
+    // Decompose linear index i into per-dimension positions:
+    // val1 varies fastest, then val2, then val3.
+    auto extent_x = whole_length.val1;
+    auto extent_y = whole_length.val2;
+    auto extent_z = whole_length.val3;
+    auto rows     = i / extent_x; // whole x-rows below i
+    auto planes   = rows / extent_y; // whole xy-planes below i
+
+    return input_val_3D<T>{i % extent_x, rows % extent_y, planes % extent_z};
+}
+
+template <typename T>
+__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length)
+{
+    // Batch number: divide linear index i by the x, y and z extents in turn.
+    auto extent_x = length.val1;
+    auto extent_y = length.val2;
+    auto extent_z = length.val3;
+    auto planes   = i / extent_x / extent_y;
+
+    return planes / extent_z;
+}
+
+// Fill interleaved complex data: one thread per element, each component is hiprand_uniform_double - 0.5.
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+    generate_interleaved_data_kernel(const Tint             whole_length,
+                                     const Tint             zero_length,
+                                     size_t idist, size_t isize,
+                                     const Tint             istride,
+                                     rocfft_complex<Treal>* data)
+{
+    // Widen each product's first factor to size_t so the index cannot wrap in 32 bits on large grids.
+    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
+                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS;
+    static_assert(sizeof(i) >= sizeof(isize));
+    if(i < isize)
+    {
+        auto i_length = get_length(i, whole_length);
+        auto i_batch  = get_batch(i, whole_length);
+        auto i_base   = i_batch * idist;
+        // seed: offset of this batch's first element; idx: offset of element i
+        auto seed = compute_index(zero_length, istride, i_base);
+        auto idx  = compute_index(i_length, istride, i_base);
+        // per-thread generator state, initialised from (seed, idx) only
+        hiprandStatePhilox4_32_10 gen_state;
+        hiprand_init(seed, idx, 0, &gen_state);
+        data[idx].x = hiprand_uniform_double(&gen_state) - 0.5;
+        data[idx].y = hiprand_uniform_double(&gen_state) - 0.5;
+    }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+    generate_planar_data_kernel(const Tint whole_length,
+                                const Tint zero_length,
+                                size_t     idist,
+                                size_t     isize,
+                                const Tint istride,
+                                Treal*     real_data,
+                                Treal*     imag_data)
+{
+    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
+                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS;
+    static_assert(sizeof(i) >= sizeof(isize)); // every term above is widened first, so i cannot wrap in 32 bits
+    if(i < isize)
+    {
+        auto i_length = get_length(i, whole_length);
+        auto i_batch  = get_batch(i, whole_length);
+        auto i_base   = i_batch * idist;
+        // seed is the index computed from zero_length at this batch's base; idx is this element's slot.
+        auto seed = compute_index(zero_length, istride, i_base);
+        auto idx  = compute_index(i_length, istride, i_base);
+        // One generator state per element: idx is passed as the subsequence, with offset 0.
+        hiprandStatePhilox4_32_10 gen_state;
+        hiprand_init(seed, idx, 0, &gen_state);
+        // Two uniform samples, each shifted by -0.5, go to the separate real and imaginary arrays.
+        real_data[idx] = hiprand_uniform_double(&gen_state) - 0.5;
+        imag_data[idx] = hiprand_uniform_double(&gen_state) - 0.5;
+    }
+}
+
+template <typename Tint, typename Treal>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+    generate_real_data_kernel(const Tint whole_length,
+                              const Tint zero_length,
+                              size_t     idist,
+                              size_t     isize,
+                              const Tint istride,
+                              Treal*     data)
+{
+    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
+                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS;
+    static_assert(sizeof(i) >= sizeof(isize)); // every term above is widened first, so i cannot wrap in 32 bits
+    if(i < isize)
+    {
+        auto i_length = get_length(i, whole_length);
+        auto i_batch  = get_batch(i, whole_length);
+        auto i_base   = i_batch * idist;
+        // seed is the index computed from zero_length at this batch's base; idx is this element's slot.
+        auto seed = compute_index(zero_length, istride, i_base);
+        auto idx  = compute_index(i_length, istride, i_base);
+        // One generator state per element: idx is passed as the subsequence, with offset 0.
+        hiprandStatePhilox4_32_10 gen_state;
+        hiprand_init(seed, idx, 0, &gen_state);
+        // A single uniform sample shifted by -0.5.
+        data[idx] = hiprand_uniform_double(&gen_state) - 0.5;
+    }
+}
+
+// For complex-to-real transforms, the input data must be Hermitian-symmetric.
+// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
+// space.  For multi-dimensional data, this means that we only need to store a bit more
+// than half of the complex values; the rest are redundant.  However, there are still
+// some restrictions:
+// * the origin and Nyquist value(s) must be real-valued
+// * some of the remaining values are still redundant, and you might get different results
+//   than you expect if the values don't agree.
+// Below are some example kernels which impose Hermitian symmetry on a complex array
+// of the given dimensions.
+
+// Kernels for imposing Hermitian symmetry on 1D
+// complex (interleaved/planar) data on the GPU.
+
+template <typename Tcomplex>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+    impose_hermitian_symmetry_interleaved_1(Tcomplex*    x,
+                                            const size_t Nx,
+                                            const size_t xstride,
+                                            const size_t dist,
+                                            const size_t nbatch,
+                                            const bool   Nxeven)
+{
+    auto idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    static_assert(sizeof(idx) == sizeof(size_t));
+    // One thread per batch entry; threads at or beyond nbatch do nothing.
+    if(idx < nbatch)
+    {
+        idx *= dist; // idx is now the offset of this batch's first element
+
+        // The DC mode must be real-valued.
+        x[idx].y = 0.0;
+
+        if(Nxeven)
+        {
+            // Nyquist mode
+            auto pos = idx + (Nx / 2) * xstride;
+            x[pos].y = 0.0;
+        }
+    }
+}
+
+template <typename Tfloat>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS)
+    impose_hermitian_symmetry_planar_1(Tfloat*      xreal,
+                                       Tfloat*      ximag,
+                                       const size_t Nx,
+                                       const size_t xstride,
+                                       const size_t dist,
+                                       const size_t nbatch,
+                                       const bool   Nxeven)
+{
+    auto idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    static_assert(sizeof(idx) == sizeof(size_t));
+    // One thread per batch entry; only the imaginary array is modified here.
+    if(idx < nbatch)
+    {
+        idx *= dist; // idx is now the offset of this batch's first element
+
+        // The DC mode must be real-valued.
+        ximag[idx] = 0;
+
+        if(Nxeven)
+        {
+            // Nyquist mode
+            auto pos   = idx + (Nx / 2) * xstride;
+            ximag[pos] = 0;
+        }
+    }
+}
+
+// Kernels for imposing Hermitian symmetry on 2D
+// complex (interleaved/planar) data on the GPU.
+
+template <typename Tcomplex>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS)
+    impose_hermitian_symmetry_interleaved_2(Tcomplex*    x,
+                                            const size_t Nx,
+                                            const size_t Ny,
+                                            const size_t xstride,
+                                            const size_t ystride,
+                                            const size_t dist,
+                                            const size_t nbatch,
+                                            const bool   Nxeven,
+                                            const bool   Nyeven)
+{
+    auto       idx = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
+    const auto idy = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    static_assert(sizeof(idx) == sizeof(size_t));
+    static_assert(sizeof(idy) == sizeof(size_t));
+    // Grid y selects the batch, grid x the y coordinate; only idy <= Ny / 2 does any work.
+    if(idy < (Ny / 2 + 1) && idx < nbatch)
+    {
+        idx *= dist;
+        // pos is this thread's element; cpos is its mirror at Ny - idy (or 0 when idy == 0).
+        auto pos  = idx + idy * ystride;
+        auto cpos = idx + (idy == 0 ? 0 : (Ny - idy)) * ystride;
+
+        auto val = x[pos];
+
+        // DC mode:
+        if(idy == 0)
+            val.y = 0.0;
+
+        // Axes need to be symmetrized:
+        if(idy > 0 && idy < (Ny + 1) / 2)
+            val.y = -val.y;
+
+        // y-Nyquist
+        if(Nyeven && idy == Ny / 2)
+            val.y = 0.0;
+
+        x[cpos] = val;
+
+        if(Nxeven)
+        {
+            pos += (Nx / 2) * xstride; // repeat the same treatment at x offset Nx / 2
+            cpos += (Nx / 2) * xstride;
+
+            val = x[pos];
+
+            // DC mode:
+            if(idy == 0)
+                val.y = 0;
+
+            // Axes need to be symmetrized:
+            if(idy > 0 && idy < (Ny + 1) / 2)
+                val.y = -val.y;
+
+            // y-Nyquist
+            if(Nyeven && idy == Ny / 2)
+                val.y = 0;
+
+            x[cpos] = val;
+        }
+    }
+}
+
+template <typename Tfloat>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS)
+    impose_hermitian_symmetry_planar_2(Tfloat*      xreal,
+                                       Tfloat*      ximag,
+                                       const size_t Nx,
+                                       const size_t Ny,
+                                       const size_t xstride,
+                                       const size_t ystride,
+                                       const size_t dist,
+                                       const size_t nbatch,
+                                       const bool   Nxeven,
+                                       const bool   Nyeven)
+{
+    auto       idx = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
+    const auto idy = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    static_assert(sizeof(idx) == sizeof(size_t));
+    static_assert(sizeof(idy) == sizeof(size_t));
+    // Grid y selects the batch, grid x the y coordinate; only idy <= Ny / 2 does any work.
+    if(idy < (Ny / 2 + 1) && idx < nbatch)
+    {
+        idx *= dist;
+        // pos is this thread's element; cpos is its mirror at Ny - idy (or 0 when idy == 0).
+        auto pos  = idx + idy * ystride;
+        auto cpos = idx + (idy == 0 ? 0 : (Ny - idy)) * ystride;
+
+        auto valreal = xreal[pos];
+        auto valimag = ximag[pos];
+
+        // DC mode:
+        if(idy == 0)
+            valimag = 0;
+
+        // Axes need to be symmetrized:
+        if(idy > 0 && idy < (Ny + 1) / 2)
+            valimag = -valimag;
+
+        // y-Nyquist
+        if(Nyeven && idy == Ny / 2)
+            valimag = 0;
+
+        xreal[cpos] = valreal;
+        ximag[cpos] = valimag;
+
+        if(Nxeven)
+        {
+            pos += (Nx / 2) * xstride; // repeat the same treatment at x offset Nx / 2
+            cpos += (Nx / 2) * xstride;
+
+            valreal = xreal[pos];
+            valimag = ximag[pos];
+
+            // DC mode:
+            if(idy == 0)
+                valimag = 0;
+
+            // Axes need to be symmetrized:
+            if(idy > 0 && idy < (Ny + 1) / 2)
+                valimag = -valimag;
+
+            // y-Nyquist
+            if(Nyeven && idy == Ny / 2)
+                valimag = 0;
+
+            xreal[cpos] = valreal;
+            ximag[cpos] = valimag;
+        }
+    }
+}
+
+// Kernels for imposing Hermitian symmetry on 3D
+// complex (interleaved/planar) data on the GPU.
+
+template <typename Tcomplex>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS* DATA_GEN_THREADS)
+    impose_hermitian_symmetry_interleaved_3(Tcomplex*    x,
+                                            const size_t Nx,
+                                            const size_t Ny,
+                                            const size_t Nz,
+                                            const size_t xstride,
+                                            const size_t ystride,
+                                            const size_t zstride,
+                                            const size_t dist,
+                                            const size_t nbatch,
+                                            const bool   Nxeven,
+                                            const bool   Nyeven,
+                                            const bool   Nzeven)
+{
+    const auto idy = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    const auto idz = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
+    auto       idx = static_cast<size_t>(blockIdx.z) * blockDim.z + threadIdx.z;
+    static_assert(sizeof(idx) == sizeof(size_t));
+    static_assert(sizeof(idy) == sizeof(size_t));
+    static_assert(sizeof(idz) == sizeof(size_t));
+    // Grid x/y select the (y, z) coordinate and grid z the batch; out-of-range threads do nothing.
+    if(idy < Ny && idz < Nz && idx < nbatch)
+    {
+        idx *= dist;
+        // pos is this thread's element; cpos is its mirror at (Ny - idy, Nz - idz), with 0 mapping to 0.
+        auto pos = idx + idy * ystride + idz * zstride;
+        auto cpos
+            = idx + (idy == 0 ? 0 : (Ny - idy)) * ystride + (idz == 0 ? 0 : (Nz - idz)) * zstride;
+
+        // Origin
+        if(idy == 0 && idz == 0)
+        {
+            x[pos].y = 0.0;
+        }
+
+        // y-Nyquist
+        if(Nyeven && idy == Ny / 2 && idz == 0)
+        {
+            x[pos].y = 0.0;
+        }
+
+        // z-Nyquist
+        if(Nzeven && idz == Nz / 2 && idy == 0)
+        {
+            x[pos].y = 0.0;
+        }
+
+        // yz-Nyquist
+        if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
+        {
+            x[pos].y = 0.0;
+        }
+
+        // z-axis
+        if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
+        {
+            x[cpos].x = x[pos].x;
+            x[cpos].y = -x[pos].y;
+        }
+
+        // y-Nyquist axis
+        if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
+        {
+            x[cpos].x = x[pos].x;
+            x[cpos].y = -x[pos].y;
+        }
+
+        // y-axis
+        if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
+        {
+            x[cpos].x = x[pos].x;
+            x[cpos].y = -x[pos].y;
+        }
+
+        // z-Nyquist axis
+        if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
+        {
+            x[cpos].x = x[pos].x;
+            x[cpos].y = -x[pos].y;
+        }
+
+        // yz plane
+        if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
+        {
+            x[cpos].x = x[pos].x;
+            x[cpos].y = -x[pos].y;
+        }
+
+        if(Nxeven)
+        {
+            pos += (Nx / 2) * xstride; // repeat the same treatment at x offset Nx / 2
+            cpos += (Nx / 2) * xstride;
+            // Origin
+            if(idy == 0 && idz == 0)
+                x[pos].y = 0.0;
+
+            // y-Nyquist
+            if(Nyeven && idy == Ny / 2 && idz == 0)
+                x[pos].y = 0.0;
+
+            // z-Nyquist
+            if(Nzeven && idz == Nz / 2 && idy == 0)
+                x[pos].y = 0.0;
+
+            // yz-Nyquist
+            if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
+                x[pos].y = 0.0;
+
+            // z-axis
+            if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
+            {
+                x[cpos].x = x[pos].x;
+                x[cpos].y = -x[pos].y;
+            }
+
+            // y-Nyquist axis
+            if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
+            {
+                x[cpos].x = x[pos].x;
+                x[cpos].y = -x[pos].y;
+            }
+
+            // y-axis
+            if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
+            {
+                x[cpos].x = x[pos].x;
+                x[cpos].y = -x[pos].y;
+            }
+
+            // z-Nyquist axis
+            if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
+            {
+                x[cpos].x = x[pos].x;
+                x[cpos].y = -x[pos].y;
+            }
+
+            // yz plane
+            if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
+            {
+                x[cpos].x = x[pos].x;
+                x[cpos].y = -x[pos].y;
+            }
+        }
+    }
+}
+
+template <typename Tfloat>
+__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS* DATA_GEN_THREADS)
+    impose_hermitian_symmetry_planar_3(Tfloat*      xreal,
+                                       Tfloat*      ximag,
+                                       const size_t Nx,
+                                       const size_t Ny,
+                                       const size_t Nz,
+                                       const size_t xstride,
+                                       const size_t ystride,
+                                       const size_t zstride,
+                                       const size_t dist,
+                                       const size_t nbatch,
+                                       const bool   Nxeven,
+                                       const bool   Nyeven,
+                                       const bool   Nzeven)
+{
+    const auto idy = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    const auto idz = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
+    auto       idx = static_cast<size_t>(blockIdx.z) * blockDim.z + threadIdx.z;
+    static_assert(sizeof(idx) == sizeof(size_t));
+    static_assert(sizeof(idy) == sizeof(size_t));
+    static_assert(sizeof(idz) == sizeof(size_t));
+    // Grid x/y select the (y, z) coordinate and grid z the batch; out-of-range threads do nothing.
+    if(idy < Ny && idz < Nz && idx < nbatch)
+    {
+        idx *= dist;
+        // pos is this thread's element; cpos is its mirror at (Ny - idy, Nz - idz), with 0 mapping to 0.
+        auto pos = idx + idy * ystride + idz * zstride;
+        auto cpos
+            = idx + (idy == 0 ? 0 : (Ny - idy)) * ystride + (idz == 0 ? 0 : (Nz - idz)) * zstride;
+
+        // Origin
+        if(idy == 0 && idz == 0)
+        {
+            ximag[pos] = 0;
+        }
+
+        // y-Nyquist
+        if(Nyeven && idy == Ny / 2 && idz == 0)
+        {
+            ximag[pos] = 0;
+        }
+
+        // z-Nyquist
+        if(Nzeven && idz == Nz / 2 && idy == 0)
+        {
+            ximag[pos] = 0;
+        }
+
+        // yz-Nyquist
+        if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
+        {
+            ximag[pos] = 0;
+        }
+
+        // z-axis
+        if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
+        {
+            xreal[cpos] = xreal[pos];
+            ximag[cpos] = -ximag[pos];
+        }
+
+        // y-Nyquist axis
+        if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
+        {
+            xreal[cpos] = xreal[pos];
+            ximag[cpos] = -ximag[pos];
+        }
+
+        // y-axis
+        if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
+        {
+            xreal[cpos] = xreal[pos];
+            ximag[cpos] = -ximag[pos];
+        }
+
+        // z-Nyquist axis
+        if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
+        {
+            xreal[cpos] = xreal[pos];
+            ximag[cpos] = -ximag[pos];
+        }
+
+        // yz plane
+        if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
+        {
+            xreal[cpos] = xreal[pos];
+            ximag[cpos] = -ximag[pos];
+        }
+
+        if(Nxeven)
+        {
+            pos += (Nx / 2) * xstride; // repeat the same treatment at x offset Nx / 2
+            cpos += (Nx / 2) * xstride;
+            // Origin
+            if(idy == 0 && idz == 0)
+                ximag[pos] = 0;
+
+            // y-Nyquist
+            if(Nyeven && idy == Ny / 2 && idz == 0)
+                ximag[pos] = 0;
+
+            // z-Nyquist
+            if(Nzeven && idz == Nz / 2 && idy == 0)
+                ximag[pos] = 0;
+
+            // yz-Nyquist
+            if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2)
+                ximag[pos] = 0;
+
+            // z-axis
+            if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2)
+            {
+                xreal[cpos] = xreal[pos];
+                ximag[cpos] = -ximag[pos];
+            }
+
+            // y-Nyquist axis
+            if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2)
+            {
+                xreal[cpos] = xreal[pos];
+                ximag[cpos] = -ximag[pos];
+            }
+
+            // y-axis
+            if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0)
+            {
+                xreal[cpos] = xreal[pos];
+                ximag[cpos] = -ximag[pos];
+            }
+
+            // z-Nyquist axis
+            if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2)
+            {
+                xreal[cpos] = xreal[pos];
+                ximag[cpos] = -ximag[pos];
+            }
+
+            // yz plane
+            if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz)
+            {
+                xreal[cpos] = xreal[pos];
+                ximag[cpos] = -ximag[pos];
+            }
+        }
+    }
+}
+
+// get grid dimensions for data gen kernel
+static dim3 generate_data_gridDim(const size_t isize)
+{
+    auto blockSize = DATA_GEN_THREADS;
+    // total number of blocks needed in the grid
+    auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
+    // (kept in size_t; it is narrowed only after being split across X and Y below)
+    // Total work items per dimension in the grid is counted in
+    // uint32_t.  Since each thread initializes one element, very
+    // large amounts of data will overflow this total size if we do
+    // all this work in one grid dimension, causing launch failure.
+    //
+    // CUDA also generally allows for effectively unlimited grid X
+    // dim, but Y and Z are more limited.
+    auto gridDim_y = static_cast<unsigned int>(std::min<size_t>(DATA_GEN_GRID_Y_MAX, numBlocks_setup));
+    auto gridDim_x = static_cast<unsigned int>(DivRoundingUp<size_t>(numBlocks_setup, DATA_GEN_GRID_Y_MAX));
+    return {gridDim_x, gridDim_y};
+}
+
+template <typename Tint, typename Treal>
+inline void generate_interleaved_data(const Tint&            whole_length,
+                                      const size_t           idist,
+                                      const size_t           isize,
+                                      const Tint&            istride,
+                                      rocfft_complex<Treal>* input_data)
+{
+    auto input_length = get_input_val(whole_length);
+    auto zero_length  = make_zero_length(input_length);
+    auto input_stride = get_input_val(istride);
+    // Launch with DATA_GEN_THREADS threads per block and a grid sized from isize; throws on launch error.
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(generate_interleaved_data_kernel<decltype(input_length), Treal>),
+        generate_data_gridDim(isize),
+        dim3(DATA_GEN_THREADS),
+        0, // sharedMemBytes
+        0, // stream
+        input_length,
+        zero_length,
+        idist,
+        isize,
+        input_stride,
+        input_data);
+    auto err = hipGetLastError();
+    if(err != hipSuccess)
+        throw std::runtime_error("generate_interleaved_data_kernel launch failure: "
+                                 + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+inline void generate_planar_data(const Tint&  whole_length,
+                                 const size_t idist,
+                                 const size_t isize,
+                                 const Tint&  istride,
+                                 Treal*       real_data,
+                                 Treal*       imag_data)
+{
+    auto input_length = get_input_val(whole_length);
+    auto zero_length  = make_zero_length(input_length);
+    auto input_stride = get_input_val(istride);
+    // Launch with DATA_GEN_THREADS threads per block and a grid sized from isize; throws on launch error.
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel<decltype(input_length), Treal>),
+                       generate_data_gridDim(isize),
+                       dim3(DATA_GEN_THREADS),
+                       0, // sharedMemBytes
+                       0, // stream
+                       input_length,
+                       zero_length,
+                       idist,
+                       isize,
+                       input_stride,
+                       real_data,
+                       imag_data);
+    auto err = hipGetLastError();
+    if(err != hipSuccess)
+        throw std::runtime_error("generate_planar_data_kernel launch failure: "
+                                 + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tint, typename Treal>
+inline void generate_real_data(const Tint&  whole_length,
+                               const size_t idist,
+                               const size_t isize,
+                               const Tint&  istride,
+                               Treal*       input_data)
+{
+    auto input_length = get_input_val(whole_length);
+    auto zero_length  = make_zero_length(input_length);
+    auto input_stride = get_input_val(istride);
+    // Launch with DATA_GEN_THREADS threads per block and a grid sized from isize; throws on launch error.
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel<decltype(input_length), Treal>),
+                       generate_data_gridDim(isize),
+                       dim3(DATA_GEN_THREADS),
+                       0, // sharedMemBytes
+                       0, // stream
+                       input_length,
+                       zero_length,
+                       idist,
+                       isize,
+                       input_stride,
+                       input_data);
+    auto err = hipGetLastError();
+    if(err != hipSuccess)
+        throw std::runtime_error("generate_real_data_kernel launch failure: "
+                                 + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tcomplex>
+void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length,
+                                           const std::vector<size_t>& ilength,
+                                           const std::vector<size_t>& stride,
+                                           size_t                     dist,
+                                           size_t                     batch,
+                                           Tcomplex*                  input_data)
+{
+    auto blockSize = DATA_GEN_THREADS;
+    // Blocks hold blockSize threads per dimension (the kernels' launch bounds); the grid covers the data.
+    switch(length.size())
+    {
+    case 1:
+    {
+        const auto gridDim  = dim3(DivRoundingUp<size_t>(batch, blockSize));
+        const auto blockDim = dim3(blockSize);
+
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1<Tcomplex>,
+                           gridDim,
+                           blockDim,
+                           0,
+                           0,
+                           input_data,
+                           length[0],
+                           stride[0],
+                           dist,
+                           batch,
+                           length[0] % 2 == 0);
+
+        break;
+    }
+    case 2:
+    {
+        const auto gridDim  = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
+                                   DivRoundingUp<size_t>(batch, blockSize));
+        const auto blockDim = dim3(blockSize, blockSize);
+
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2<Tcomplex>,
+                           gridDim,
+                           blockDim,
+                           0,
+                           0,
+                           input_data,
+                           length[1],
+                           length[0],
+                           stride[1],
+                           stride[0],
+                           dist,
+                           batch,
+                           length[1] % 2 == 0,
+                           length[0] % 2 == 0);
+
+        break;
+    }
+    case 3:
+    {
+        const auto gridDim  = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
+                                   DivRoundingUp<size_t>(ilength[1], blockSize),
+                                   DivRoundingUp<size_t>(batch, blockSize));
+        const auto blockDim = dim3(blockSize, blockSize, blockSize);
+
+        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3<Tcomplex>,
+                           gridDim,
+                           blockDim,
+                           0,
+                           0,
+                           input_data,
+                           length[2],
+                           length[0],
+                           length[1],
+                           stride[2],
+                           stride[0],
+                           stride[1],
+                           dist,
+                           batch,
+                           length[2] % 2 == 0,
+                           length[0] % 2 == 0,
+                           length[1] % 2 == 0);
+        break;
+    }
+    default:
+        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+    }
+    auto err = hipGetLastError();
+    if(err != hipSuccess)
+        throw std::runtime_error("impose_hermitian_symmetry_interleaved launch failure: "
+                                 + std::string(hipGetErrorName(err)));
+}
+
+template <typename Tfloat>
+void impose_hermitian_symmetry_planar(const std::vector<size_t>& length,
+                                      const std::vector<size_t>& ilength,
+                                      const std::vector<size_t>& stride,
+                                      size_t                     dist,
+                                      size_t                     batch,
+                                      Tfloat*                    input_data_real,
+                                      Tfloat*                    input_data_imag)
+{
+    auto blockSize = DATA_GEN_THREADS;
+    // Blocks hold blockSize threads per dimension (the kernels' launch bounds); the grid covers the data.
+    switch(length.size())
+    {
+    case 1:
+    {
+        const auto gridDim  = dim3(DivRoundingUp<size_t>(batch, blockSize));
+        const auto blockDim = dim3(blockSize);
+
+        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1<Tfloat>,
+                           gridDim,
+                           blockDim,
+                           0,
+                           0,
+                           input_data_real,
+                           input_data_imag,
+                           length[0],
+                           stride[0],
+                           dist,
+                           batch,
+                           length[0] % 2 == 0);
+
+        break;
+    }
+    case 2:
+    {
+        const auto gridDim  = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
+                                   DivRoundingUp<size_t>(batch, blockSize));
+        const auto blockDim = dim3(blockSize, blockSize);
+
+        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2<Tfloat>,
+                           gridDim,
+                           blockDim,
+                           0,
+                           0,
+                           input_data_real,
+                           input_data_imag,
+                           length[1],
+                           length[0],
+                           stride[1],
+                           stride[0],
+                           dist,
+                           batch,
+                           length[1] % 2 == 0,
+                           length[0] % 2 == 0);
+
+        break;
+    }
+    case 3:
+    {
+        const auto gridDim  = dim3(DivRoundingUp<size_t>(ilength[0], blockSize),
+                                   DivRoundingUp<size_t>(ilength[1], blockSize),
+                                   DivRoundingUp<size_t>(batch, blockSize));
+        const auto blockDim = dim3(blockSize, blockSize, blockSize);
+
+        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3<Tfloat>,
+                           gridDim,
+                           blockDim,
+                           0,
+                           0,
+                           input_data_real,
+                           input_data_imag,
+                           length[2],
+                           length[0],
+                           length[1],
+                           stride[2],
+                           stride[0],
+                           stride[1],
+                           dist,
+                           batch,
+                           length[2] % 2 == 0,
+                           length[0] % 2 == 0,
+                           length[1] % 2 == 0);
+        break;
+    }
+    default:
+        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
+    }
+    auto err = hipGetLastError();
+    if(err != hipSuccess)
+        throw std::runtime_error("impose_hermitian_symmetry_planar launch failure: "
+                                 + std::string(hipGetErrorName(err)));
+}
+
+#endif // DATA_GEN_H
diff -Nru rocfft-5.5.0/shared/enum_to_string.h rocfft-5.7.1/shared/enum_to_string.h
--- rocfft-5.5.0/shared/enum_to_string.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/enum_to_string.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ENUM_TO_STRING_H
+#define ENUM_TO_STRING_H
+
+#include "fft_params.h"
+
+// Return the enumerator name for a hipError_t code; unlisted codes throw std::runtime_error.
+static std::string hipError_to_string(const hipError_t ret)
+{
+    switch(ret)
+    {
+    case hipSuccess:
+        return "hipSuccess";
+    case hipErrorInvalidContext:
+        return "hipErrorInvalidContext";
+    case hipErrorInvalidKernelFile:
+        return "hipErrorInvalidKernelFile";
+    case hipErrorMemoryAllocation:
+        return "hipErrorMemoryAllocation";
+    case hipErrorInitializationError:
+        return "hipErrorInitializationError";
+    case hipErrorLaunchFailure:
+        return "hipErrorLaunchFailure";
+    case hipErrorLaunchOutOfResources:
+        return "hipErrorLaunchOutOfResources";
+    case hipErrorInvalidDevice:
+        return "hipErrorInvalidDevice";
+    case hipErrorInvalidValue:
+        return "hipErrorInvalidValue";
+    case hipErrorInvalidDevicePointer:
+        return "hipErrorInvalidDevicePointer";
+    case hipErrorInvalidMemcpyDirection:
+        return "hipErrorInvalidMemcpyDirection";
+    case hipErrorUnknown:
+        return "hipErrorUnknown";
+    case hipErrorInvalidResourceHandle:
+        return "hipErrorInvalidResourceHandle";
+    case hipErrorNotReady:
+        return "hipErrorNotReady";
+    case hipErrorNoDevice:
+        return "hipErrorNoDevice";
+    case hipErrorPeerAccessAlreadyEnabled:
+        return "hipErrorPeerAccessAlreadyEnabled";
+    case hipErrorPeerAccessNotEnabled:
+        return "hipErrorPeerAccessNotEnabled";
+    case hipErrorRuntimeMemory:
+        return "hipErrorRuntimeMemory";
+    case hipErrorRuntimeOther:
+        return "hipErrorRuntimeOther";
+    case hipErrorHostMemoryAlreadyRegistered:
+        return "hipErrorHostMemoryAlreadyRegistered";
+    case hipErrorHostMemoryNotRegistered:
+        return "hipErrorHostMemoryNotRegistered";
+    case hipErrorMapBufferObjectFailed:
+        return "hipErrorMapBufferObjectFailed";
+    case hipErrorTbd:
+        return "hipErrorTbd";
+    default:
+        throw std::runtime_error("unknown hipError");
+    }
+}
+#endif
diff -Nru rocfft-5.5.0/shared/fft_params.h rocfft-5.7.1/shared/fft_params.h
--- rocfft-5.5.0/shared/fft_params.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/fft_params.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,3062 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef FFT_PARAMS_H
+#define FFT_PARAMS_H
+
+#include <algorithm>
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <mutex>
+#include <numeric>
+#include <sstream>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <random>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+#include "../shared/arithmetic.h"
+#include "../shared/array_validator.h"
+#include "../shared/data_gen.h"
+#include "../shared/printbuffer.h"
+#include "../shared/ptrdiff.h"
+
+enum fft_status
+{
+    fft_status_success,
+    fft_status_failure,
+    fft_status_invalid_arg_value,
+    fft_status_invalid_dimensions,
+    fft_status_invalid_array_type,
+    fft_status_invalid_strides,
+    fft_status_invalid_distance,
+    fft_status_invalid_offset,
+    fft_status_invalid_work_buffer,
+};
+
+enum fft_transform_type
+{
+    fft_transform_type_complex_forward,
+    fft_transform_type_complex_inverse,
+    fft_transform_type_real_forward,
+    fft_transform_type_real_inverse,
+};
+
+enum fft_precision
+{
+    fft_precision_half,
+    fft_precision_single,
+    fft_precision_double,
+};
+
+static std::istream& operator>>(std::istream& str, fft_precision& precision)
+{
+    std::string word;
+    str >> word;
+
+    if(word == "half")
+        precision = fft_precision_half;
+    else if(word == "single")
+        precision = fft_precision_single;
+    else if(word == "double")
+        precision = fft_precision_double;
+    else
+        throw std::runtime_error("Invalid precision specified");
+    return str;
+}
+
+enum fft_array_type
+{
+    fft_array_type_complex_interleaved,
+    fft_array_type_complex_planar,
+    fft_array_type_real,
+    fft_array_type_hermitian_interleaved,
+    fft_array_type_hermitian_planar,
+    fft_array_type_unset,
+};
+
+enum fft_result_placement
+{
+    fft_placement_inplace,
+    fft_placement_notinplace,
+};
+
+// Determine the size in bytes of one array element given the precision and array type.
+template <typename Tsize>
+inline Tsize var_size(const fft_precision precision, const fft_array_type type)
+{
+    size_t var_size = 0;
+    switch(precision)
+    {
+    case fft_precision_half:
+        var_size = sizeof(_Float16);
+        break;
+    case fft_precision_single:
+        var_size = sizeof(float);
+        break;
+    case fft_precision_double:
+        var_size = sizeof(double);
+        break;
+    }
+    switch(type)
+    {
+    case fft_array_type_complex_interleaved:
+    case fft_array_type_hermitian_interleaved:
+        var_size *= 2;
+        break;
+    default:
+        break;
+    }
+    return var_size;
+}
+
+// Count the total number of iterations for 1-, 2-, and 3-D index ranges.
+template <typename T1>
+size_t count_iters(const T1& i)
+{
+    return i;
+}
+
+template <typename T1>
+size_t count_iters(const std::tuple<T1, T1>& i)
+{
+    return std::get<0>(i) * std::get<1>(i);
+}
+
+template <typename T1>
+size_t count_iters(const std::tuple<T1, T1, T1>& i)
+{
+    return std::get<0>(i) * std::get<1>(i) * std::get<2>(i);
+}
+
+// Given an array type and transform length, strides, etc, load random floats in [0,1]
+// into the input array of floats/doubles or complex floats/doubles gpu buffers.
+template <typename Tfloat, typename Tint1>
+inline void set_input(std::vector<gpubuf>&       input,
+                      const fft_array_type       itype,
+                      const std::vector<size_t>& length,
+                      const std::vector<size_t>& ilength,
+                      const std::vector<size_t>& stride,
+                      const Tint1&               whole_length,
+                      const Tint1&               istride,
+                      const size_t               idist,
+                      const size_t               nbatch)
+{
+    auto isize = count_iters(whole_length) * nbatch;
+
+    switch(itype)
+    {
+    case fft_array_type_complex_interleaved:
+    case fft_array_type_hermitian_interleaved:
+    {
+
+        auto ibuffer = (rocfft_complex<Tfloat>*)input[0].data();
+        generate_interleaved_data(whole_length, idist, isize, istride, ibuffer);
+
+        if(itype == fft_array_type_hermitian_interleaved)
+        {
+            auto ibuffer_2 = (rocfft_complex<Tfloat>*)input[0].data();
+            impose_hermitian_symmetry_interleaved(
+                length, ilength, stride, idist, nbatch, ibuffer_2);
+        }
+
+        break;
+    }
+    case fft_array_type_complex_planar:
+    case fft_array_type_hermitian_planar:
+    {
+        auto ibuffer_real = (Tfloat*)input[0].data();
+        auto ibuffer_imag = (Tfloat*)input[1].data();
+
+        generate_planar_data(whole_length, idist, isize, istride, ibuffer_real, ibuffer_imag);
+
+        if(itype == fft_array_type_hermitian_planar)
+            impose_hermitian_symmetry_planar(
+                length, ilength, stride, idist, nbatch, ibuffer_real, ibuffer_imag);
+
+        break;
+    }
+    case fft_array_type_real:
+    {
+        auto ibuffer = (Tfloat*)input[0].data();
+
+        generate_real_data(whole_length, idist, isize, istride, ibuffer);
+
+        break;
+    }
+    default:
+        throw std::runtime_error("Input layout format not yet supported");
+    }
+}
+
+// Dispatch set_input on the number of dimensions (1, 2, or 3); any other rank aborts.
+template <typename Tfloat>
+inline void set_input(std::vector<gpubuf>&       input,
+                      const fft_array_type       itype,
+                      const std::vector<size_t>& length,
+                      const std::vector<size_t>& ilength,
+                      const std::vector<size_t>& istride,
+                      const size_t               idist,
+                      const size_t               nbatch)
+{
+    switch(length.size())
+    {
+    case 1:
+        set_input<Tfloat>(
+            input, itype, length, ilength, istride, ilength[0], istride[0], idist, nbatch);
+        break;
+    case 2:
+        set_input<Tfloat>(input,
+                          itype,
+                          length,
+                          ilength,
+                          istride,
+                          std::make_tuple(ilength[0], ilength[1]),
+                          std::make_tuple(istride[0], istride[1]),
+                          idist,
+                          nbatch);
+        break;
+    case 3:
+        set_input<Tfloat>(input,
+                          itype,
+                          length,
+                          ilength,
+                          istride,
+                          std::make_tuple(ilength[0], ilength[1], ilength[2]),
+                          std::make_tuple(istride[0], istride[1], istride[2]),
+                          idist,
+                          nbatch);
+        break;
+    default:
+        abort();
+    }
+}
+
+// Container class for test parameters.
+class fft_params
+{
+public:
+    // All parameters are row-major.
+    std::vector<size_t>  length;
+    std::vector<size_t>  istride;
+    std::vector<size_t>  ostride;
+    size_t               nbatch         = 1;
+    fft_precision        precision      = fft_precision_single;
+    fft_transform_type   transform_type = fft_transform_type_complex_forward;
+    fft_result_placement placement      = fft_placement_inplace;
+    size_t               idist          = 0;
+    size_t               odist          = 0;
+    fft_array_type       itype          = fft_array_type_unset;
+    fft_array_type       otype          = fft_array_type_unset;
+    std::vector<size_t>  ioffset        = {0, 0};
+    std::vector<size_t>  ooffset        = {0, 0};
+
+    std::vector<size_t> isize;
+    std::vector<size_t> osize;
+
+    size_t workbuffersize = 0;
+
+    // run testing load/store callbacks
+    bool                    run_callbacks   = false;
+    static constexpr double load_cb_scalar  = 0.457813941;
+    static constexpr double store_cb_scalar = 0.391504938;
+
+    // Check that data outside of output strides is not overwritten.
+    // This is only set explicitly on some tests where there's space
+    // between dimensions, but the dimensions are still in-order.
+    // We're not trying to generically find holes in arbitrary data
+    // layouts.
+    //
+    // NOTE: this flag is not included in tokens, since it doesn't
+    // affect how the FFT library behaves.
+    bool check_output_strides = false;
+
+    // scaling factor - we do a pointwise multiplication of outputs by
+    // this factor
+    double scale_factor = 1.0;
+
+    fft_params(){};
+    virtual ~fft_params(){};
+
+    // Given an array type, return the name as a string.
+    static std::string array_type_name(const fft_array_type type, bool verbose = true)
+    {
+        switch(type)
+        {
+        case fft_array_type_complex_interleaved:
+            return verbose ? "fft_array_type_complex_interleaved" : "CI";
+        case fft_array_type_complex_planar:
+            return verbose ? "fft_array_type_complex_planar" : "CP";
+        case fft_array_type_real:
+            return verbose ? "fft_array_type_real" : "R";
+        case fft_array_type_hermitian_interleaved:
+            return verbose ? "fft_array_type_hermitian_interleaved" : "HI";
+        case fft_array_type_hermitian_planar:
+            return verbose ? "fft_array_type_hermitian_planar" : "HP";
+        case fft_array_type_unset:
+            return verbose ? "fft_array_type_unset" : "UN";
+        }
+        return "";
+    }
+
+    std::string transform_type_name() const
+    {
+        switch(transform_type)
+        {
+        case fft_transform_type_complex_forward:
+            return "fft_transform_type_complex_forward";
+        case fft_transform_type_complex_inverse:
+            return "fft_transform_type_complex_inverse";
+        case fft_transform_type_real_forward:
+            return "fft_transform_type_real_forward";
+        case fft_transform_type_real_inverse:
+            return "fft_transform_type_real_inverse";
+        default:
+            throw std::runtime_error("Invalid transform type");
+        }
+    }
+
+    // Convert to string for output.
+    std::string str(const std::string& separator = ", ") const
+    {
+        std::stringstream ss;
+        ss << "length:";
+        for(auto i : length)
+            ss << " " << i;
+        ss << separator;
+        ss << "istride:";
+        for(auto i : istride)
+            ss << " " << i;
+        ss << separator;
+        ss << "idist: " << idist << separator;
+
+        ss << "ostride:";
+        for(auto i : ostride)
+            ss << " " << i;
+        ss << separator;
+        ss << "odist: " << odist << separator;
+
+        ss << "batch: " << nbatch << separator;
+        ss << "isize:";
+        for(auto i : isize)
+            ss << " " << i;
+        ss << separator;
+        ss << "osize:";
+        for(auto i : osize)
+            ss << " " << i;
+        ss << separator;
+
+        ss << "ioffset:";
+        for(auto i : ioffset)
+            ss << " " << i;
+        ss << separator;
+        ss << "ooffset:";
+        for(auto i : ooffset)
+            ss << " " << i;
+        ss << separator;
+
+        if(placement == fft_placement_inplace)
+            ss << "in-place";
+        else
+            ss << "out-of-place";
+        ss << separator;
+        ss << "transform_type: " << transform_type_name() << separator;
+        ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator;
+        switch(precision)
+        {
+        case fft_precision_half:
+            ss << "half-precision";
+            break;
+        case fft_precision_single:
+            ss << "single-precision";
+            break;
+        case fft_precision_double:
+            ss << "double-precision";
+            break;
+        }
+        ss << separator;
+
+        ss << "ilength:";
+        for(const auto i : ilength())
+            ss << " " << i;
+        ss << separator;
+        ss << "olength:";
+        for(const auto i : olength())
+            ss << " " << i;
+        ss << separator;
+
+        ss << "ibuffer_size:";
+        for(const auto i : ibuffer_sizes())
+            ss << " " << i;
+        ss << separator;
+
+        ss << "obuffer_size:";
+        for(const auto i : obuffer_sizes())
+            ss << " " << i;
+        ss << separator;
+
+        if(scale_factor != 1.0)
+            ss << "scale factor: " << scale_factor << separator;
+
+        return ss.str();
+    }
+
+    // Produce a stringified token of the test fft params.
+    std::string token() const
+    {
+        std::string ret;
+
+        switch(transform_type)
+        {
+        case fft_transform_type_complex_forward:
+            ret += "complex_forward_";
+            break;
+        case fft_transform_type_complex_inverse:
+            ret += "complex_inverse_";
+            break;
+        case fft_transform_type_real_forward:
+            ret += "real_forward_";
+            break;
+        case fft_transform_type_real_inverse:
+            ret += "real_inverse_";
+            break;
+        }
+
+        ret += "len_";
+
+        for(auto n : length)
+        {
+            ret += std::to_string(n);
+            ret += "_";
+        }
+        switch(precision)
+        {
+        case fft_precision_half:
+            ret += "half_";
+            break;
+        case fft_precision_single:
+            ret += "single_";
+            break;
+        case fft_precision_double:
+            ret += "double_";
+            break;
+        }
+
+        switch(placement)
+        {
+        case fft_placement_inplace:
+            ret += "ip_";
+            break;
+        case fft_placement_notinplace:
+            ret += "op_";
+            break;
+        }
+
+        ret += "batch_";
+        ret += std::to_string(nbatch);
+
+        auto append_array_info = [&ret](const std::vector<size_t>& stride, fft_array_type type) {
+            for(auto s : stride)
+            {
+                ret += std::to_string(s);
+                ret += "_";
+            }
+
+            switch(type)
+            {
+            case fft_array_type_complex_interleaved:
+                ret += "CI";
+                break;
+            case fft_array_type_complex_planar:
+                ret += "CP";
+                break;
+            case fft_array_type_real:
+                ret += "R";
+                break;
+            case fft_array_type_hermitian_interleaved:
+                ret += "HI";
+                break;
+            case fft_array_type_hermitian_planar:
+                ret += "HP";
+                break;
+            default:
+                ret += "UN";
+                break;
+            }
+        };
+
+        ret += "_istride_";
+        append_array_info(istride, itype);
+
+        ret += "_ostride_";
+        append_array_info(ostride, otype);
+
+        ret += "_idist_";
+        ret += std::to_string(idist);
+        ret += "_odist_";
+        ret += std::to_string(odist);
+
+        ret += "_ioffset";
+        for(auto n : ioffset)
+        {
+            ret += "_";
+            ret += std::to_string(n);
+        }
+
+        ret += "_ooffset";
+        for(auto n : ooffset)
+        {
+            ret += "_";
+            ret += std::to_string(n);
+        }
+
+        if(run_callbacks)
+            ret += "_CB";
+
+        if(scale_factor != 1.0)
+            ret += "_scale";
+
+        return ret;
+    }
+
+    // Set all params from a stringified token.
+    // The token is split on '_' and its fields are consumed in the order in
+    // which token() writes them.  Throws std::runtime_error when an expected
+    // keyword is missing or the token ends before all required fields were
+    // read; std::stoull may still throw for a malformed batch/idist/odist.
+    void from_token(std::string token)
+    {
+        std::vector<std::string> vals;
+
+        std::string delimiter = "_";
+        {
+            size_t pos = 0;
+            while((pos = token.find(delimiter)) != std::string::npos)
+            {
+                auto val = token.substr(0, pos);
+                vals.push_back(val);
+                token.erase(0, pos + delimiter.length());
+            }
+            vals.push_back(token);
+        }
+
+        // Bounds-checked field access: a truncated token must raise an error
+        // rather than read past the end of vals.
+        auto field = [&vals](const size_t idx) -> const std::string& {
+            if(idx >= vals.size())
+                throw std::runtime_error("Unable to parse token");
+            return vals[idx];
+        };
+
+        // True for a non-empty field made only of decimal digits.  Taking an
+        // unsigned char keeps isdigit() well-defined for bytes above 0x7f.
+        auto is_digit  = [](unsigned char c) { return ::isdigit(c) != 0; };
+        auto is_number = [&is_digit](const std::string& val) {
+            return !val.empty() && std::all_of(val.begin(), val.end(), is_digit);
+        };
+
+        // Consume the keyword `name`, then every numeric field that follows it.
+        auto vector_parser = [&field, &vals, &is_number](const std::string& name, size_t& pos) {
+            if(field(pos++) != name)
+                throw std::runtime_error("Unable to parse token");
+            std::vector<size_t> vec;
+            while(pos < vals.size() && is_number(vals[pos]))
+                vec.push_back(std::stoull(vals[pos++]));
+            return vec;
+        };
+
+        auto type_parser = [](const std::string& val) {
+            if(val == "CI")
+                return fft_array_type_complex_interleaved;
+            else if(val == "CP")
+                return fft_array_type_complex_planar;
+            else if(val == "R")
+                return fft_array_type_real;
+            else if(val == "HI")
+                return fft_array_type_hermitian_interleaved;
+            else if(val == "HP")
+                return fft_array_type_hermitian_planar;
+            return fft_array_type_unset;
+        };
+
+        size_t pos = 0;
+        const bool complex = field(pos++) == "complex";
+        const bool forward = field(pos++) == "forward";
+
+        if(complex)
+            transform_type = forward ? fft_transform_type_complex_forward
+                                     : fft_transform_type_complex_inverse;
+        else
+            transform_type = forward ? fft_transform_type_real_forward
+                                     : fft_transform_type_real_inverse;
+
+        length = vector_parser("len", pos);
+
+        // An unrecognized precision name leaves the current precision as is.
+        const std::string& prec = field(pos++);
+        if(prec == "half")
+            precision = fft_precision_half;
+        else if(prec == "single")
+            precision = fft_precision_single;
+        else if(prec == "double")
+            precision = fft_precision_double;
+
+        placement = (field(pos++) == "ip") ? fft_placement_inplace : fft_placement_notinplace;
+
+        if(field(pos++) != "batch")
+            throw std::runtime_error("Unable to parse token");
+        nbatch = std::stoull(field(pos++));
+
+        istride = vector_parser("istride", pos);
+        itype   = type_parser(field(pos++));
+        ostride = vector_parser("ostride", pos);
+        otype   = type_parser(field(pos++));
+
+        if(field(pos++) != "idist")
+            throw std::runtime_error("Unable to parse token");
+        idist = std::stoull(field(pos++));
+        if(field(pos++) != "odist")
+            throw std::runtime_error("Unable to parse token");
+        odist = std::stoull(field(pos++));
+
+        ioffset = vector_parser("ioffset", pos);
+        ooffset = vector_parser("ooffset", pos);
+
+        if(pos < vals.size() && vals[pos] == "CB")
+        {
+            run_callbacks = true;
+            ++pos;
+        }
+
+        if(pos < vals.size() && vals[pos] == "scale")
+        {
+            // just pick some factor that's not zero or one
+            scale_factor = 0.1239;
+            ++pos;
+        }
+    }
+
+    // Stream output operator (for gtest, etc).
+    friend std::ostream& operator<<(std::ostream& stream, const fft_params& params)
+    {
+        stream << params.str();
+        return stream;
+    }
+
+    // Dimension of the transform.
+    size_t dim() const
+    {
+        return length.size();
+    }
+
+    virtual std::vector<size_t> ilength() const
+    {
+        auto ilength = length;
+        if(transform_type == fft_transform_type_real_inverse)
+            ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1;
+        return ilength;
+    }
+
+    virtual std::vector<size_t> olength() const
+    {
+        auto olength = length;
+        if(transform_type == fft_transform_type_real_forward)
+            olength[dim() - 1] = olength[dim() - 1] / 2 + 1;
+        return olength;
+    }
+
+    static size_t nbuffer(const fft_array_type type)
+    {
+        switch(type)
+        {
+        case fft_array_type_real:
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+            return 1;
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            return 2;
+        case fft_array_type_unset:
+            return 0;
+        }
+        return 0;
+    }
+
+    // Number of input buffers
+    size_t nibuffer() const
+    {
+        return nbuffer(itype);
+    }
+
+    // Number of output buffers
+    size_t nobuffer() const
+    {
+        return nbuffer(otype);
+    }
+
+    void set_iotypes()
+    {
+        if(itype == fft_array_type_unset)
+        {
+            switch(transform_type)
+            {
+            case fft_transform_type_complex_forward:
+            case fft_transform_type_complex_inverse:
+                itype = fft_array_type_complex_interleaved;
+                break;
+            case fft_transform_type_real_forward:
+                itype = fft_array_type_real;
+                break;
+            case fft_transform_type_real_inverse:
+                itype = fft_array_type_hermitian_interleaved;
+                break;
+            default:
+                throw std::runtime_error("Invalid transform type");
+            }
+        }
+        if(otype == fft_array_type_unset)
+        {
+            switch(transform_type)
+            {
+            case fft_transform_type_complex_forward:
+            case fft_transform_type_complex_inverse:
+                otype = fft_array_type_complex_interleaved;
+                break;
+            case fft_transform_type_real_forward:
+                otype = fft_array_type_hermitian_interleaved;
+                break;
+            case fft_transform_type_real_inverse:
+                otype = fft_array_type_real;
+                break;
+            default:
+                throw std::runtime_error("Invalid transform type");
+            }
+        }
+    }
+
+    // Check that the input and output types are consistent.
+    // Returns true when itype -> otype is a supported pairing (complex ->
+    // complex, hermitian -> real, real -> hermitian) and false otherwise.
+    // Throws std::runtime_error when either type is unset/unrecognized, or when
+    // an in-place complex transform has different input and output types.
+    bool check_iotypes() const
+    {
+        // Only fft_array_type_unset (or an out-of-range value) fails this check.
+        const auto is_concrete = [](const fft_array_type type) {
+            switch(type)
+            {
+            case fft_array_type_complex_interleaved:
+            case fft_array_type_complex_planar:
+            case fft_array_type_hermitian_interleaved:
+            case fft_array_type_hermitian_planar:
+            case fft_array_type_real:
+                return true;
+            default:
+                return false;
+            }
+        };
+
+        if(!is_concrete(itype))
+            throw std::runtime_error("Invalid Input array type format");
+        if(!is_concrete(otype))
+            throw std::runtime_error("Invalid Output array type format");
+
+        // Check that format choices are supported
+        if(transform_type != fft_transform_type_real_forward
+           && transform_type != fft_transform_type_real_inverse)
+        {
+            if(placement == fft_placement_inplace && itype != otype)
+            {
+                throw std::runtime_error(
+                    "In-place transforms must have identical input and output types");
+            }
+        }
+
+        bool okformat = true;
+        switch(itype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_complex_planar:
+            okformat = (otype == fft_array_type_complex_interleaved
+                        || otype == fft_array_type_complex_planar);
+            break;
+        case fft_array_type_hermitian_interleaved:
+        case fft_array_type_hermitian_planar:
+            okformat = otype == fft_array_type_real;
+            break;
+        case fft_array_type_real:
+            okformat = (otype == fft_array_type_hermitian_interleaved
+                        || otype == fft_array_type_hermitian_planar);
+            break;
+        default:
+            throw std::runtime_error("Invalid Input array type format");
+        }
+
+        return okformat;
+    }
+
+    // Given a length vector, fill in the strides that were not supplied (row-major).
+    // stride0, if non-empty, provides the strides of the trailing (fastest-varying)
+    // dimensions; if it is empty, the last dimension gets stride 1.
+    // rcpadding pads the last dimension to 2 * (length / 2 + 1) elements when
+    // computing the next stride, as needed by in-place real/complex transforms.
+    template <typename T1>
+    std::vector<T1> compute_stride(const std::vector<T1>&     length,
+                                   const std::vector<size_t>& stride0   = std::vector<size_t>(),
+                                   const bool                 rcpadding = false) const
+    {
+        std::vector<T1> stride(dim());
+
+        size_t dimoffset = 0;
+
+        if(stride0.size() == 0)
+        {
+            // Set the contiguous stride:
+            stride[dim() - 1] = 1;
+            dimoffset         = 1;
+        }
+        else
+        {
+            // Copy the input values to the end of the stride array:
+            for(size_t i = 0; i < stride0.size(); ++i)
+            {
+                stride[dim() - stride0.size() + i] = stride0[i];
+            }
+        }
+
+        if(stride0.size() < dim())
+        {
+            // Compute the remaining strides, innermost first, from the recurrence below.
+            for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;)
+            {
+                auto lengthip1 = length[i + 1];
+                if(rcpadding && i == dim() - 2)
+                {
+                    lengthip1 = 2 * (lengthip1 / 2 + 1);
+                }
+                stride[i] = stride[i + 1] * lengthip1;
+            }
+        }
+
+        return stride;
+    }
+
+    void compute_istride()
+    {
+        istride = compute_stride(ilength(),
+                                 istride,
+                                 placement == fft_placement_inplace
+                                     && transform_type == fft_transform_type_real_forward);
+    }
+
+    void compute_ostride()
+    {
+        ostride = compute_stride(olength(),
+                                 ostride,
+                                 placement == fft_placement_inplace
+                                     && transform_type == fft_transform_type_real_inverse);
+    }
+
+    virtual void compute_isize()
+    {
+        auto   il  = ilength();
+        size_t val = compute_ptrdiff(il, istride, nbatch, idist);
+        isize.resize(nibuffer());
+        for(unsigned int i = 0; i < isize.size(); ++i)
+        {
+            isize[i] = val + ioffset[i];
+        }
+    }
+
+    virtual void compute_osize()
+    {
+        auto   ol  = olength();
+        size_t val = compute_ptrdiff(ol, ostride, nbatch, odist);
+        osize.resize(nobuffer());
+        for(unsigned int i = 0; i < osize.size(); ++i)
+        {
+            osize[i] = val + ooffset[i];
+        }
+    }
+
+    std::vector<size_t> ibuffer_sizes() const
+    {
+        std::vector<size_t> ibuffer_sizes;
+
+        // In-place real-to-complex transforms need to have enough space in the input buffer to
+        // accommodate the output, which is slightly larger.
+        if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward)
+        {
+            return obuffer_sizes();
+        }
+
+        if(isize.empty())
+            return ibuffer_sizes;
+
+        switch(itype)
+        {
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            ibuffer_sizes.resize(2);
+            break;
+        default:
+            ibuffer_sizes.resize(1);
+        }
+        for(unsigned i = 0; i < ibuffer_sizes.size(); i++)
+        {
+            ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype);
+        }
+        return ibuffer_sizes;
+    }
+
+    virtual std::vector<size_t> obuffer_sizes() const
+    {
+        std::vector<size_t> obuffer_sizes;
+
+        if(osize.empty())
+            return obuffer_sizes;
+
+        switch(otype)
+        {
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            obuffer_sizes.resize(2);
+            break;
+        default:
+            obuffer_sizes.resize(1);
+        }
+        for(unsigned i = 0; i < obuffer_sizes.size(); i++)
+        {
+            obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype);
+        }
+        return obuffer_sizes;
+    }
+
+    // Compute the idist for a given transform based on the placeness, transform type, and data
+    // layout.
+    size_t compute_idist() const
+    {
+        size_t dist = 0;
+        // In-place 1D transforms need extra dist.
+        if(transform_type == fft_transform_type_real_forward && dim() == 1
+           && placement == fft_placement_inplace)
+        {
+            dist = 2 * (length[0] / 2 + 1) * istride[0];
+            return dist;
+        }
+
+        if(transform_type == fft_transform_type_real_inverse && dim() == 1)
+        {
+            dist = (length[0] / 2 + 1) * istride[0];
+            return dist;
+        }
+
+        dist = (transform_type == fft_transform_type_real_inverse)
+                   ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1]
+                   : length[dim() - 1] * istride[dim() - 1];
+        for(unsigned int i = 0; i < dim() - 1; ++i)
+        {
+            dist = std::max(length[i] * istride[i], dist);
+        }
+        return dist;
+    }
+    void set_idist()
+    {
+        if(idist != 0)
+            return;
+        idist = compute_idist();
+    }
+
+    // Compute the odist for a given transform based on the placeness, transform type, and data
+    // layout.  Row-major.
+    size_t compute_odist() const
+    {
+        size_t dist = 0;
+        // In-place 1D transforms need extra dist.
+        if(transform_type == fft_transform_type_real_inverse && dim() == 1
+           && placement == fft_placement_inplace)
+        {
+            dist = 2 * (length[0] / 2 + 1) * ostride[0];
+            return dist;
+        }
+
+        if(transform_type == fft_transform_type_real_forward && dim() == 1)
+        {
+            dist = (length[0] / 2 + 1) * ostride[0];
+            return dist;
+        }
+
+        dist = (transform_type == fft_transform_type_real_forward)
+                   ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
+                   : length[dim() - 1] * ostride[dim() - 1];
+        for(unsigned int i = 0; i < dim() - 1; ++i)
+        {
+            dist = std::max(length[i] * ostride[i], dist);
+        }
+        return dist;
+    }
+    void set_odist()
+    {
+        if(odist != 0)
+            return;
+        odist = compute_odist();
+    }
+
+    // Put the length, stride, batch, and dist into a single length/stride array and pass off to the
+    // validity checker.
+    bool valid_length_stride_batch_dist(const std::vector<size_t>& l0,
+                                        const std::vector<size_t>& s0,
+                                        const size_t               n,
+                                        const size_t               dist,
+                                        const int                  verbose = 0) const
+    {
+        if(l0.size() != s0.size()) // one stride per length is required
+            return false;
+
+        // Length and stride vectors, including batches:
+        std::vector<size_t> l{}, s{};
+        for(unsigned int i = 0; i < l0.size(); ++i)
+        {
+            if(l0[i] > 1) // dimensions of length 0 or 1 are dropped, so their stride is not checked
+            {
+                if(s0[i] == 0) // a zero stride is rejected for any dimension longer than one element
+                    return false;
+                l.push_back(l0[i]);
+                s.push_back(s0[i]);
+            }
+        }
+        if(n > 1) // the batch is appended as one more dimension, with dist as its stride
+        {
+            if(dist == 0)
+                return false;
+            l.push_back(n);
+            s.push_back(dist);
+        }
+
+        return array_valid(l, s, verbose);
+    }
+
+    // Return true if the given GPU parameters would produce a valid transform.
+    bool valid(const int verbose) const
+    {
+        if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer())
+            return false;
+
+        // Check that in-place transforms have the same input and output stride:
+        if(placement == fft_placement_inplace)
+        {
+            const auto stridesize = std::min(istride.size(), ostride.size());
+            bool       samestride = true;
+            for(unsigned int i = 0; i < stridesize; ++i)
+            {
+                if(istride[i] != ostride[i])
+                    samestride = false;
+            }
+            if((transform_type == fft_transform_type_complex_forward
+                || transform_type == fft_transform_type_complex_inverse)
+               && !samestride)
+            {
+                // In-place transforms require identical input and output strides.
+                if(verbose)
+                {
+                    std::cout << "istride:";
+                    for(const auto& i : istride)
+                        std::cout << " " << i;
+                    std::cout << " ostride0:";
+                    for(const auto& i : ostride)
+                        std::cout << " " << i;
+                    std::cout << " differ; skipped for in-place transforms: skipping test"
+                              << std::endl;
+                }
+                return false;
+            }
+
+            if((transform_type == fft_transform_type_complex_forward
+                || transform_type == fft_transform_type_complex_inverse)
+               && (idist != odist) && nbatch > 1)
+            {
+                // In-place transforms require identical distance, if
+                // batch > 1.  If batch is 1 then dist is ignored and
+                // the FFT should still work.
+                if(verbose)
+                {
+                    std::cout << "idist:" << idist << " odist:" << odist
+                              << " differ; skipped for in-place transforms: skipping test"
+                              << std::endl;
+                }
+                return false;
+            }
+
+            if((transform_type == fft_transform_type_real_forward
+                || transform_type == fft_transform_type_real_inverse)
+               && (istride.back() != 1 || ostride.back() != 1))
+            {
+                // In-place real/complex transforms require unit strides.
+                if(verbose)
+                {
+                    std::cout
+                        << "istride.back(): " << istride.back()
+                        << " ostride.back(): " << ostride.back()
+                        << " must be unitary for in-place real/complex transforms: skipping test"
+                        << std::endl;
+                }
+                return false;
+            }
+
+            if((itype == fft_array_type_complex_interleaved
+                && otype == fft_array_type_complex_planar)
+               || (itype == fft_array_type_complex_planar
+                   && otype == fft_array_type_complex_interleaved))
+            {
+                if(verbose)
+                {
+                    std::cout << "In-place c2c transforms require identical io types; skipped.\n";
+                }
+                return false;
+            }
+
+            // Check offsets
+            switch(transform_type)
+            {
+            case fft_transform_type_complex_forward:
+            case fft_transform_type_complex_inverse:
+                for(unsigned int i = 0; i < nibuffer(); ++i)
+                {
+                    if(ioffset[i] != ooffset[i])
+                        return false;
+                }
+                break;
+            case fft_transform_type_real_forward:
+                if(ioffset[0] != 2 * ooffset[0])
+                    return false;
+                break;
+            case fft_transform_type_real_inverse:
+                if(2 * ioffset[0] != ooffset[0])
+                    return false;
+                break;
+            }
+        }
+
+        if(!check_iotypes())
+            return false;
+
+        // we can only check output strides on out-of-place
+        // transforms, since we need to initialize output to a known
+        // pattern
+        if(placement == fft_placement_inplace && check_output_strides)
+            return false;
+
+        // Check input and output strides
+        if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true)
+        {
+            if(verbose)
+                std::cout << "Invalid input data format.\n";
+            return false;
+        }
+        if(!(ilength() == olength() && istride == ostride && idist == odist))
+        {
+            // Only check if different
+            if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true)
+            {
+                if(verbose)
+                    std::cout << "Invalid output data format.\n";
+                return false;
+            }
+        }
+
+        // The parameters are valid.
+        return true;
+    }
+
+    // Fill in any missing parameters.
+    void validate()
+    {
+        set_iotypes();
+        compute_istride();
+        compute_ostride();
+        set_idist();
+        set_odist();
+        compute_isize();
+        compute_osize();
+    }
+
+    // Column-major getters:
+    std::vector<size_t> length_cm() const
+    {
+        auto length_cm = length;
+        std::reverse(std::begin(length_cm), std::end(length_cm));
+        return length_cm;
+    }
+    std::vector<size_t> ilength_cm() const
+    {
+        auto ilength_cm = ilength();
+        std::reverse(std::begin(ilength_cm), std::end(ilength_cm));
+        return ilength_cm;
+    }
+    std::vector<size_t> olength_cm() const
+    {
+        auto olength_cm = olength();
+        std::reverse(std::begin(olength_cm), std::end(olength_cm));
+        return olength_cm;
+    }
+    std::vector<size_t> istride_cm() const
+    {
+        auto istride_cm = istride;
+        std::reverse(std::begin(istride_cm), std::end(istride_cm));
+        return istride_cm;
+    }
+    std::vector<size_t> ostride_cm() const
+    {
+        auto ostride_cm = ostride;
+        std::reverse(std::begin(ostride_cm), std::end(ostride_cm));
+        return ostride_cm;
+    }
+    bool is_planar() const
+    {
+        if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar)
+            return true;
+        if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar)
+            return true;
+        return false;
+    }
+
+    // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary.
+    inline void compute_input(std::vector<gpubuf>& input)
+    {
+        switch(precision)
+        {
+        case fft_precision_half:
+            set_input<_Float16>(input, itype, length, ilength(), istride, idist, nbatch);
+            break;
+        case fft_precision_double:
+            set_input<double>(input, itype, length, ilength(), istride, idist, nbatch);
+            break;
+        case fft_precision_single:
+            set_input<float>(input, itype, length, ilength(), istride, idist, nbatch);
+            break;
+        }
+    }
+
+    template <typename Tstream = std::ostream>
+    void print_ibuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const
+    {
+        switch(itype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<rocfft_complex<_Float16>> s;
+                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<rocfft_complex<float>> s;
+                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+                break;
+            }
+            case fft_precision_double:
+            {
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+                break;
+            }
+            }
+            break;
+        }
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+        case fft_array_type_real:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<_Float16> s;
+                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<float> s;
+                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+                break;
+            }
+            case fft_precision_double:
+            {
+                buffer_printer<double> s;
+                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
+                break;
+            }
+            }
+            break;
+        }
+        default:
+            throw std::runtime_error("Invalid itype in print_ibuffer");
+        }
+    }
+
+    template <typename Tstream = std::ostream>
+    void print_obuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const
+    { // Prints buf using the output layout; NOTE(review): `stream` is never used in this body.
+        switch(otype) // the element type handed to buffer_printer follows the output array type
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<rocfft_complex<_Float16>> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<rocfft_complex<float>> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+            case fft_precision_double:
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+            break;
+        }
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+        case fft_array_type_real:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<_Float16> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<float> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+            case fft_precision_double:
+            {
+                buffer_printer<double> s;
+                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
+                break;
+            }
+            }
+            break;
+        }
+
+        default: // reached only when otype is none of the layouts above
+            throw std::runtime_error("Invalid otype in print_obuffer");
+        }
+    }
+
+    void print_ibuffer_flat(const std::vector<hostbuf>& buf) const
+    { // Hands buf to buffer_printer::print_buffer_flat together with the INPUT sizes and offsets.
+        switch(itype) // the element type handed to buffer_printer follows the input array type
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<rocfft_complex<_Float16>> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<rocfft_complex<float>> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+            case fft_precision_double:
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+            break;
+        }
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+        case fft_array_type_real:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<_Float16> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<float> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+            case fft_precision_double:
+            {
+                buffer_printer<double> s;
+                s.print_buffer_flat(buf, isize, ioffset);
+                break;
+            }
+            }
+            break;
+        }
+        default: // reached only when itype is none of the layouts above
+            throw std::runtime_error("Invalid itype in print_ibuffer_flat");
+        }
+    }
+
+    void print_obuffer_flat(const std::vector<hostbuf>& buf) const
+    { // Hands buf to buffer_printer::print_buffer_flat together with the OUTPUT sizes and offsets.
+        switch(otype) // the element type handed to buffer_printer follows the output array type
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<rocfft_complex<_Float16>> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<rocfft_complex<float>> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+            case fft_precision_double:
+                buffer_printer<rocfft_complex<double>> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+            break;
+        }
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+        case fft_array_type_real:
+        {
+            switch(precision)
+            {
+            case fft_precision_half:
+            {
+                buffer_printer<_Float16> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+            case fft_precision_single:
+            {
+                buffer_printer<float> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+
+            case fft_precision_double:
+            {
+                buffer_printer<double> s;
+                s.print_buffer_flat(buf, osize, ooffset);
+                break;
+            }
+            }
+            break;
+        }
+        default: // reached only when otype is none of the layouts above
+            throw std::runtime_error("Invalid otype in print_obuffer_flat");
+        }
+    }
+
+    virtual fft_status set_callbacks(void* load_cb_host,
+                                     void* load_cb_data,
+                                     void* store_cb_host,
+                                     void* store_cb_data)
+    {
+        return fft_status_success;
+    }
+
+    virtual fft_status execute(void** in, void** out)
+    {
+        return fft_status_success;
+    };
+
+    size_t fft_params_vram_footprint()
+    {
+        return fft_params::vram_footprint();
+    }
+
+    virtual size_t vram_footprint()
+    { // Adds up ibuffer_sizes(), plus obuffer_sizes() when the transform is not in-place.
+        const auto ibuf_size = ibuffer_sizes();
+        size_t     val       = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1);
+        if(placement == fft_placement_notinplace) // NOTE(review): both sums are seeded with 1, not 0
+        {
+            const auto obuf_size = obuffer_sizes();
+            val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1);
+        }
+        return val; // so the result exceeds the plain sum by 1 (in-place) or 2 - confirm intent
+    }
+
+    // Specific exception type for work buffer allocation failure.
+    // Tests that hit this can't fit on the GPU and should be skipped.
+    struct work_buffer_alloc_failure : public std::runtime_error
+    {
+        work_buffer_alloc_failure(const std::string& s)
+            : std::runtime_error(s)
+        {
+        }
+    };
+
+    virtual fft_status create_plan()
+    {
+        return fft_status_success;
+    }
+
+    // Change a forward transform to its inverse
+    void inverse_from_forward(fft_params& params_forward)
+    {
+        switch(params_forward.transform_type)
+        {
+        case fft_transform_type_complex_forward:
+            transform_type = fft_transform_type_complex_inverse;
+            break;
+        case fft_transform_type_real_forward:
+            transform_type = fft_transform_type_real_inverse;
+            break;
+        default:
+            throw std::runtime_error("Transform type not forward.");
+        }
+
+        length    = params_forward.length;
+        istride   = params_forward.ostride;
+        ostride   = params_forward.istride;
+        nbatch    = params_forward.nbatch;
+        precision = params_forward.precision;
+        placement = params_forward.placement;
+        idist     = params_forward.odist;
+        odist     = params_forward.idist;
+        itype     = params_forward.otype;
+        otype     = params_forward.itype;
+        ioffset   = params_forward.ooffset;
+        ooffset   = params_forward.ioffset;
+
+        run_callbacks = params_forward.run_callbacks;
+
+        check_output_strides = params_forward.check_output_strides;
+
+        scale_factor = 1 / params_forward.scale_factor;
+    }
+};
+
+// This is used with the program_options class so that the user can type an integer on the
+// command line and we store it into an enum variable
+template <typename _Elem, typename _Traits>
+std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
+                                               fft_array_type&                     atype)
+{
+    unsigned tmp;
+    stream >> tmp;
+    atype = fft_array_type(tmp);
+    return stream;
+}
+
+// similarly for transform type
+template <typename _Elem, typename _Traits>
+std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
+                                               fft_transform_type&                 ttype)
+{
+    unsigned tmp;
+    stream >> tmp;
+    ttype = fft_transform_type(tmp);
+    return stream;
+}
+
+// Work out how many partitions to break our iteration problem into
+template <typename T1>
+static size_t compute_partition_count(T1 length)
+{
+#ifdef _OPENMP
+    // we seem to get contention from too many threads, which slows
+    // things down.  particularly noticeable with mix_3D tests
+    static const size_t MAX_PARTITIONS = 8;
+    size_t              iters          = count_iters(length);
+    size_t hw_threads = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs()));
+    if(!hw_threads)
+        return 1;
+
+    // don't bother threading problem sizes that are too small. pick
+    // an arbitrary number of iterations per partition; the count below
+    // grows by one for roughly every MIN_ITERS_PER_THREAD iterations
+    static const size_t MIN_ITERS_PER_THREAD = 2048;
+
+    // at most hw_threads, never 0; NOTE(review): "+ 1" makes this differ from an exact ceil - confirm
+    return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD);
+#else
+    return 1;
+#endif
+}
+
+// Break a scalar length into at most num_parts contiguous pieces, returning
+// [(start0, end0), (start1, end1), ...]; at least one piece is always returned.
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+
+    // don't exceed the length, but keep one piece so the division below is never by zero
+    num_parts = std::max<size_t>(1, std::min<size_t>(length, num_parts));
+
+    std::vector<std::pair<T1, T1>> ret(num_parts);
+    auto                           partition_size = length / num_parts;
+    T1                             cur_partition  = 0;
+    for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size)
+    {
+        ret[i].first  = cur_partition;
+        ret[i].second = cur_partition + partition_size;
+    }
+    // last partition might not divide evenly, fix it up
+    ret.back().second = length;
+    return ret;
+}
+
+// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length)
+{
+    return partition_base(length, compute_partition_count(length));
+}
+
+// Partition on the leftmost part of the tuple, for row-major indexing
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
+    partition_rowmajor(const std::tuple<T1, T1>& length)
+{
+    auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
+    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
+    for(size_t i = 0; i < partitions.size(); ++i)
+    {
+        std::get<0>(ret[i].first)  = partitions[i].first;
+        std::get<1>(ret[i].first)  = 0;
+        std::get<0>(ret[i].second) = partitions[i].second;
+        std::get<1>(ret[i].second) = std::get<1>(length);
+    }
+    return ret;
+}
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
+    partition_rowmajor(const std::tuple<T1, T1, T1>& length)
+{
+    auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
+    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
+    for(size_t i = 0; i < partitions.size(); ++i)
+    {
+        std::get<0>(ret[i].first)  = partitions[i].first;
+        std::get<1>(ret[i].first)  = 0;
+        std::get<2>(ret[i].first)  = 0;
+        std::get<0>(ret[i].second) = partitions[i].second;
+        std::get<1>(ret[i].second) = std::get<1>(length);
+        std::get<2>(ret[i].second) = std::get<2>(length);
+    }
+    return ret;
+}
+
+// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
+template <typename T1>
+std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length)
+{
+    return partition_base(length, compute_partition_count(length));
+}
+
+// Partition on the rightmost part of the tuple, for col-major indexing
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
+    partition_colmajor(const std::tuple<T1, T1>& length)
+{
+    auto partitions = partition_base(std::get<1>(length), compute_partition_count(length));
+    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
+    for(size_t i = 0; i < partitions.size(); ++i)
+    {
+        std::get<1>(ret[i].first)  = partitions[i].first;
+        std::get<0>(ret[i].first)  = 0;
+        std::get<1>(ret[i].second) = partitions[i].second;
+        std::get<0>(ret[i].second) = std::get<0>(length);
+    }
+    return ret;
+}
+template <typename T1>
+std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
+    partition_colmajor(const std::tuple<T1, T1, T1>& length)
+{
+    auto partitions = partition_base(std::get<2>(length), compute_partition_count(length));
+    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
+    for(size_t i = 0; i < partitions.size(); ++i)
+    {
+        std::get<2>(ret[i].first)  = partitions[i].first;
+        std::get<1>(ret[i].first)  = 0;
+        std::get<0>(ret[i].first)  = 0;
+        std::get<2>(ret[i].second) = partitions[i].second;
+        std::get<1>(ret[i].second) = std::get<1>(length);
+        std::get<0>(ret[i].second) = std::get<0>(length);
+    }
+    return ret;
+}
+
+// Specialized computation of index given 1-, 2-, 3- dimension length + stride
+template <typename T1, typename T2>
+size_t compute_index(T1 length, T2 stride, size_t base)
+{
+    return (length * stride) + base;
+}
+
+template <typename T1, typename T2>
+size_t
+    compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+    static_assert(std::is_integral<T2>::value, "Integral required.");
+    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
+           + base;
+}
+
+template <typename T1, typename T2>
+size_t compute_index(const std::tuple<T1, T1, T1>& length,
+                     const std::tuple<T2, T2, T2>& stride,
+                     size_t                        base)
+{
+    static_assert(std::is_integral<T1>::value, "Integral required.");
+    static_assert(std::is_integral<T2>::value, "Integral required.");
+    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
+           + (std::get<2>(length) * std::get<2>(stride)) + base;
+}
+
+// Copy data of dimensions length with strides istride and length idist between batches to
+// a buffer with strides ostride and length odist between batches.  The input and output
+// types are identical.
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers_1to1(const Tval*                input,
+                              Tval*                      output,
+                              const Tint1&               whole_length,
+                              const size_t               nbatch,
+                              const Tint2&               istride,
+                              const size_t               idist,
+                              const Tint3&               ostride,
+                              const size_t               odist,
+                              const std::vector<size_t>& ioffset,
+                              const std::vector<size_t>& ooffset)
+{
+    const bool idx_equals_odx = istride == ostride && idist == odist;
+    size_t     idx_base       = 0;
+    size_t     odx_base       = 0;
+    auto       partitions     = partition_rowmajor(whole_length);
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto       index  = partitions[part].first;
+            const auto length = partitions[part].second;
+            do
+            {
+                const auto idx = compute_index(index, istride, idx_base);
+                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+                output[odx + ooffset[0]] = input[idx + ioffset[0]];
+            } while(increment_rowmajor(index, length));
+        }
+    }
+}
+
+// Copy nbatch arrays of extent whole_length from a planar pair (input0, input1; strides
+// istride, batch distance idist) into a complex-interleaved buffer (strides ostride, batch
+// distance odist).  input0/input1 become the first/second rocfft_complex constructor argument.
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers_2to1(const Tval*                input0,
+                              const Tval*                input1,
+                              rocfft_complex<Tval>*      output,
+                              const Tint1&               whole_length,
+                              const size_t               nbatch,
+                              const Tint2&               istride,
+                              const size_t               idist,
+                              const Tint3&               ostride,
+                              const size_t               odist,
+                              const std::vector<size_t>& ioffset,
+                              const std::vector<size_t>& ooffset)
+{
+    const bool idx_equals_odx = istride == ostride && idist == odist; // same layout: reuse idx
+    size_t     idx_base       = 0; // input base of the current batch, advanced by idist
+    size_t     odx_base       = 0; // output base of the current batch, advanced by odist
+    auto       partitions     = partition_rowmajor(whole_length); // (start index, bound) pairs
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto       index  = partitions[part].first; // running index, starts at the partition start
+            const auto length = partitions[part].second; // bound handed to increment_rowmajor
+            do
+            {
+                const auto idx = compute_index(index, istride, idx_base);
+                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+                output[odx + ooffset[0]] // input0 is shifted by ioffset[0], input1 by ioffset[1]
+                    = rocfft_complex<Tval>(input0[idx + ioffset[0]], input1[idx + ioffset[1]]);
+            } while(increment_rowmajor(index, length)); // stops when the helper returns false
+        }
+    }
+}
+
+// Copy nbatch arrays of extent whole_length from a complex-interleaved buffer (strides
+// istride, batch distance idist) into a planar pair (strides ostride, batch distance odist):
+// output0 receives real() of each element and output1 receives imag().
+template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers_1to2(const rocfft_complex<Tval>* input,
+                              Tval*                       output0,
+                              Tval*                       output1,
+                              const Tint1&                whole_length,
+                              const size_t                nbatch,
+                              const Tint2&                istride,
+                              const size_t                idist,
+                              const Tint3&                ostride,
+                              const size_t                odist,
+                              const std::vector<size_t>&  ioffset,
+                              const std::vector<size_t>&  ooffset)
+{
+    const bool idx_equals_odx = istride == ostride && idist == odist; // same layout: reuse idx
+    size_t     idx_base       = 0; // input base of the current batch, advanced by idist
+    size_t     odx_base       = 0; // output base of the current batch, advanced by odist
+    auto       partitions     = partition_rowmajor(whole_length); // (start index, bound) pairs
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            auto       index  = partitions[part].first; // running index, starts at the partition start
+            const auto length = partitions[part].second; // bound handed to increment_rowmajor
+            do
+            {
+                const auto idx = compute_index(index, istride, idx_base);
+                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+                output0[odx + ooffset[0]] = input[idx + ioffset[0]].real(); // shifted by ooffset[0]
+                output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag(); // shifted by ooffset[1]
+            } while(increment_rowmajor(index, length)); // stops when the helper returns false
+        }
+    }
+}
+
+// Copy nbatch arrays of extent length from input (layout itype, strides istride, batch
+// distance idist) to output (layout otype, strides ostride, batch distance odist), selecting
+// the element type from precision.  Throws std::runtime_error for unsupported type pairs.
+template <typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers(const std::vector<hostbuf>& input,
+                         std::vector<hostbuf>&       output,
+                         const Tint1&                length,
+                         const size_t                nbatch,
+                         const fft_precision         precision,
+                         const fft_array_type        itype,
+                         const Tint2&                istride,
+                         const size_t                idist,
+                         const fft_array_type        otype,
+                         const Tint3&                ostride,
+                         const size_t                odist,
+                         const std::vector<size_t>&  ioffset,
+                         const std::vector<size_t>&  ooffset)
+{
+    if(itype == otype) // same layout on both sides: element-wise copy
+    {
+        switch(itype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+            switch(precision) // no default case: an unlisted precision copies nothing
+            {
+            case fft_precision_half:
+                copy_buffers_1to1(
+                    reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+                    reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
+                    length,
+                    nbatch,
+                    istride,
+                    idist,
+                    ostride,
+                    odist,
+                    ioffset,
+                    ooffset);
+                break;
+            case fft_precision_single:
+                copy_buffers_1to1(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+                                  reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
+                                  length,
+                                  nbatch,
+                                  istride,
+                                  idist,
+                                  ostride,
+                                  odist,
+                                  ioffset,
+                                  ooffset);
+                break;
+            case fft_precision_double:
+                copy_buffers_1to1(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+                                  reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
+                                  length,
+                                  nbatch,
+                                  istride,
+                                  idist,
+                                  ostride,
+                                  odist,
+                                  ioffset,
+                                  ooffset);
+                break;
+            }
+            break;
+        case fft_array_type_real:
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            // One pass per buffer.  NOTE(review): each pass applies ioffset[0]/ooffset[0] - confirm.
+            for(unsigned int idx = 0; idx < input.size(); ++idx)
+            {
+                switch(precision) // no default case: an unlisted precision copies nothing
+                {
+                case fft_precision_half:
+                    copy_buffers_1to1(reinterpret_cast<const _Float16*>(input[idx].data()),
+                                      reinterpret_cast<_Float16*>(output[idx].data()),
+                                      length,
+                                      nbatch,
+                                      istride,
+                                      idist,
+                                      ostride,
+                                      odist,
+                                      ioffset,
+                                      ooffset);
+                    break;
+                case fft_precision_single:
+                    copy_buffers_1to1(reinterpret_cast<const float*>(input[idx].data()),
+                                      reinterpret_cast<float*>(output[idx].data()),
+                                      length,
+                                      nbatch,
+                                      istride,
+                                      idist,
+                                      ostride,
+                                      odist,
+                                      ioffset,
+                                      ooffset);
+                    break;
+                case fft_precision_double:
+                    copy_buffers_1to1(reinterpret_cast<const double*>(input[idx].data()),
+                                      reinterpret_cast<double*>(output[idx].data()),
+                                      length,
+                                      nbatch,
+                                      istride,
+                                      idist,
+                                      ostride,
+                                      odist,
+                                      ioffset,
+                                      ooffset);
+                    break;
+                }
+            }
+            break;
+        default:
+            throw std::runtime_error("Invalid data type");
+        }
+    }
+    else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
+            || (itype == fft_array_type_hermitian_interleaved
+                && otype == fft_array_type_hermitian_planar))
+    {
+        // Interleaved -> planar: input[0] is split across output[0] and output[1].
+        switch(precision)
+        {
+        case fft_precision_half:
+            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+                              reinterpret_cast<_Float16*>(output[0].data()),
+                              reinterpret_cast<_Float16*>(output[1].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              ostride,
+                              odist,
+                              ioffset,
+                              ooffset);
+            break;
+        case fft_precision_single:
+            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+                              reinterpret_cast<float*>(output[0].data()),
+                              reinterpret_cast<float*>(output[1].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              ostride,
+                              odist,
+                              ioffset,
+                              ooffset);
+            break;
+        case fft_precision_double:
+            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+                              reinterpret_cast<double*>(output[0].data()),
+                              reinterpret_cast<double*>(output[1].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              ostride,
+                              odist,
+                              ioffset,
+                              ooffset);
+            break;
+        }
+    }
+    else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
+            || (itype == fft_array_type_hermitian_planar
+                && otype == fft_array_type_hermitian_interleaved))
+    {
+        // Planar -> interleaved: input[0] and input[1] are merged into output[0].
+        switch(precision)
+        {
+        case fft_precision_half:
+            copy_buffers_2to1(reinterpret_cast<const _Float16*>(input[0].data()),
+                              reinterpret_cast<const _Float16*>(input[1].data()),
+                              reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              ostride,
+                              odist,
+                              ioffset,
+                              ooffset);
+            break;
+        case fft_precision_single:
+            copy_buffers_2to1(reinterpret_cast<const float*>(input[0].data()),
+                              reinterpret_cast<const float*>(input[1].data()),
+                              reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              ostride,
+                              odist,
+                              ioffset,
+                              ooffset);
+            break;
+        case fft_precision_double:
+            copy_buffers_2to1(reinterpret_cast<const double*>(input[0].data()),
+                              reinterpret_cast<const double*>(input[1].data()),
+                              reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              ostride,
+                              odist,
+                              ioffset,
+                              ooffset);
+            break;
+        }
+    }
+    else
+    {
+        throw std::runtime_error("Invalid input and output types.");
+    }
+}
+
+// Dispatch on length.size() (1, 2 or 3), forwarding scalars or std::tuples to copy_buffers.
+template <typename Tint1, typename Tint2, typename Tint3>
+inline void copy_buffers(const std::vector<hostbuf>& input,
+                         std::vector<hostbuf>&       output,
+                         const std::vector<Tint1>&   length,
+                         const size_t                nbatch,
+                         const fft_precision         precision,
+                         const fft_array_type        itype,
+                         const std::vector<Tint2>&   istride,
+                         const size_t                idist,
+                         const fft_array_type        otype,
+                         const std::vector<Tint3>&   ostride,
+                         const size_t                odist,
+                         const std::vector<size_t>&  ioffset,
+                         const std::vector<size_t>&  ooffset)
+{
+    switch(length.size()) // istride and ostride are indexed up to the same rank as length
+    {
+    case 1: // one dimension: pass plain scalars
+        return copy_buffers(input,
+                            output,
+                            length[0],
+                            nbatch,
+                            precision,
+                            itype,
+                            istride[0],
+                            idist,
+                            otype,
+                            ostride[0],
+                            odist,
+                            ioffset,
+                            ooffset);
+    case 2: // two dimensions: pack lengths and strides into 2-tuples
+        return copy_buffers(input,
+                            output,
+                            std::make_tuple(length[0], length[1]),
+                            nbatch,
+                            precision,
+                            itype,
+                            std::make_tuple(istride[0], istride[1]),
+                            idist,
+                            otype,
+                            std::make_tuple(ostride[0], ostride[1]),
+                            odist,
+                            ioffset,
+                            ooffset);
+    case 3: // three dimensions: pack lengths and strides into 3-tuples
+        return copy_buffers(input,
+                            output,
+                            std::make_tuple(length[0], length[1], length[2]),
+                            nbatch,
+                            precision,
+                            itype,
+                            std::make_tuple(istride[0], istride[1], istride[2]),
+                            idist,
+                            otype,
+                            std::make_tuple(ostride[0], ostride[1], ostride[2]),
+                            odist,
+                            ioffset,
+                            ooffset);
+    default: // any other number of dimensions terminates the process
+        abort();
+    }
+}
+
+// Compute the L-infinity and L-2 distance between an input buffer (strides istride, length
+// idist between batches) and an output buffer (strides ostride, length odist between
+// batches).  Both buffers are of complex type.
+
+struct VectorNorms
+{
+    double l_2 = 0.0, l_inf = 0.0; // L-2 and L-infinity norms; both default to zero
+};
+
+// Returns the L-2 and L-infinity norms of (output * output_scalar - input) for two complex
+// buffers.  If linf_failures is non-null, (batch, input index) pairs over linf_cutoff are added.
+template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance_1to1_complex(const Tcomplex*                         input,
+                                         const Tcomplex*                         output,
+                                         const Tint1&                            whole_length,
+                                         const size_t                            nbatch,
+                                         const Tint2&                            istride,
+                                         const size_t                            idist,
+                                         const Tint3&                            ostride,
+                                         const size_t                            odist,
+                                         std::vector<std::pair<size_t, size_t>>* linf_failures,
+                                         const double                            linf_cutoff,
+                                         const std::vector<size_t>&              ioffset,
+                                         const std::vector<size_t>&              ooffset,
+                                         const double output_scalar = 1.0)
+{
+    double linf = 0.0;
+    double l2   = 0.0;
+
+    // Serializes appends to *linf_failures from concurrently running partitions.
+    std::mutex linf_failure_lock;
+
+    const bool idx_equals_odx = istride == ostride && idist == odist;
+    size_t     idx_base       = 0;
+    size_t     odx_base       = 0;
+    // NOTE(review): partition_colmajor is paired with increment_rowmajor here, whereas
+    // distance_1to1_real and distance_1to2 use partition_rowmajor; confirm this is intended.
+    auto       partitions     = partition_colmajor(whole_length);
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            double     cur_linf = 0.0;
+            double     cur_l2   = 0.0;
+            auto       index    = partitions[part].first;
+            const auto length   = partitions[part].second;
+            // Failures found by this partition only.  Declaring the list here (rather than
+            // once per call) keeps entries from being appended to *linf_failures repeatedly.
+            std::vector<std::pair<size_t, size_t>> linf_failures_private;
+
+            do
+            {
+                const auto   idx = compute_index(index, istride, idx_base);
+                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+                const double rdiff
+                    = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar
+                               - static_cast<double>(input[idx + ioffset[0]].real()));
+                cur_linf = std::max(rdiff, cur_linf);
+                // Record the element only when its own error exceeds the cutoff.
+                if(linf_failures && rdiff > linf_cutoff)
+                    linf_failures_private.emplace_back(b, idx);
+                cur_l2 += rdiff * rdiff;
+
+                const double idiff
+                    = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar
+                               - static_cast<double>(input[idx + ioffset[0]].imag()));
+                cur_linf = std::max(idiff, cur_linf);
+                if(linf_failures && idiff > linf_cutoff)
+                    linf_failures_private.emplace_back(b, idx);
+                cur_l2 += idiff * idiff;
+
+            } while(increment_rowmajor(index, length));
+            linf = std::max(linf, cur_linf);
+            l2 += cur_l2;
+
+            if(linf_failures)
+            {
+                // Merge under the lock; the guard releases it even if insert throws.
+                std::lock_guard<std::mutex> guard(linf_failure_lock);
+                linf_failures->insert(linf_failures->end(),
+                                      linf_failures_private.begin(),
+                                      linf_failures_private.end());
+            }
+        }
+    }
+    return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-2 and L-infinity norms of (output * output_scalar - input) for two real
+// buffers: input has strides istride and batch distance idist, output has ostride and odist.
+// If linf_failures is non-null, (batch, input index) of elements over linf_cutoff are appended.
+template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance_1to1_real(const Tfloat*                           input,
+                                      const Tfloat*                           output,
+                                      const Tint1&                            whole_length,
+                                      const size_t                            nbatch,
+                                      const Tint2&                            istride,
+                                      const size_t                            idist,
+                                      const Tint3&                            ostride,
+                                      const size_t                            odist,
+                                      std::vector<std::pair<size_t, size_t>>* linf_failures,
+                                      const double                            linf_cutoff,
+                                      const std::vector<size_t>&              ioffset,
+                                      const std::vector<size_t>&              ooffset,
+                                      const double                            output_scalar = 1.0)
+{
+    double linf = 0.0;
+    double l2   = 0.0;
+
+    // Serializes appends to *linf_failures from concurrently running partitions.
+    std::mutex linf_failure_lock;
+
+    const bool idx_equals_odx = istride == ostride && idist == odist;
+    size_t     idx_base       = 0;
+    size_t     odx_base       = 0;
+    auto       partitions     = partition_rowmajor(whole_length);
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            double     cur_linf = 0.0;
+            double     cur_l2   = 0.0;
+            auto       index    = partitions[part].first;
+            const auto length   = partitions[part].second;
+            // Failures found by this partition only.  Declaring the list here (rather than
+            // once per call) keeps entries from being appended to *linf_failures repeatedly.
+            std::vector<std::pair<size_t, size_t>> linf_failures_private;
+            do
+            {
+                const auto   idx = compute_index(index, istride, idx_base);
+                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+                const double diff
+                    = std::abs(static_cast<double>(output[odx + ooffset[0]]) * output_scalar
+                               - static_cast<double>(input[idx + ioffset[0]]));
+                cur_linf = std::max(diff, cur_linf);
+                // Record the element only when its own error exceeds the cutoff.
+                if(linf_failures && diff > linf_cutoff)
+                    linf_failures_private.emplace_back(b, idx);
+                cur_l2 += diff * diff;
+
+            } while(increment_rowmajor(index, length));
+            linf = std::max(linf, cur_linf);
+            l2 += cur_l2;
+
+            if(linf_failures)
+            {
+                // Merge under the lock; the guard releases it even if insert throws.
+                std::lock_guard<std::mutex> guard(linf_failure_lock);
+                linf_failures->insert(linf_failures->end(),
+                                      linf_failures_private.begin(),
+                                      linf_failures_private.end());
+            }
+        }
+    }
+    return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-2 and L-infinity norms of (output * output_scalar - input), where input is
+// complex interleaved (strides istride, batch distance idist) and output is complex planar
+// (output0 = real parts, output1 = imaginary parts; strides ostride, batch distance odist).
+// If linf_failures is non-null, the (batch, input index) pair of every component whose error
+// exceeds linf_cutoff is appended to it.
+template <typename Tval, typename Tint1, typename T2, typename T3>
+inline VectorNorms distance_1to2(const rocfft_complex<Tval>*             input,
+                                 const Tval*                             output0,
+                                 const Tval*                             output1,
+                                 const Tint1&                            whole_length,
+                                 const size_t                            nbatch,
+                                 const T2&                               istride,
+                                 const size_t                            idist,
+                                 const T3&                               ostride,
+                                 const size_t                            odist,
+                                 std::vector<std::pair<size_t, size_t>>* linf_failures,
+                                 const double                            linf_cutoff,
+                                 const std::vector<size_t>&              ioffset,
+                                 const std::vector<size_t>&              ooffset,
+                                 const double                            output_scalar = 1.0)
+{
+    double linf = 0.0;
+    double l2   = 0.0;
+
+    // Serializes appends to *linf_failures from concurrently running partitions.
+    std::mutex linf_failure_lock;
+
+    // When both layouts match, the input index is reused for the output.
+    const bool idx_equals_odx = istride == ostride && idist == odist;
+    size_t     idx_base       = 0;
+    size_t     odx_base       = 0;
+    auto       partitions     = partition_rowmajor(whole_length);
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            double     cur_linf = 0.0;
+            double     cur_l2   = 0.0;
+            auto       index    = partitions[part].first;
+            const auto length   = partitions[part].second;
+            // Failures found by this partition only.  Declaring the list here (rather than
+            // once per call) keeps entries from being appended to *linf_failures repeatedly.
+            std::vector<std::pair<size_t, size_t>> linf_failures_private;
+            do
+            {
+                const auto   idx = compute_index(index, istride, idx_base);
+                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
+                const double rdiff
+                    = std::abs(static_cast<double>(output0[odx + ooffset[0]]) * output_scalar
+                               - static_cast<double>(input[idx + ioffset[0]].real()));
+                cur_linf = std::max(rdiff, cur_linf);
+                // Record the element only when its own error exceeds the cutoff.
+                if(linf_failures && rdiff > linf_cutoff)
+                    linf_failures_private.emplace_back(b, idx);
+                cur_l2 += rdiff * rdiff;
+
+                const double idiff
+                    = std::abs(static_cast<double>(output1[odx + ooffset[1]]) * output_scalar
+                               - static_cast<double>(input[idx + ioffset[0]].imag()));
+                cur_linf = std::max(idiff, cur_linf);
+                // The imaginary component is checked (and recorded) independently.
+                if(linf_failures && idiff > linf_cutoff)
+                    linf_failures_private.emplace_back(b, idx);
+                cur_l2 += idiff * idiff;
+
+            } while(increment_rowmajor(index, length));
+            linf = std::max(linf, cur_linf);
+            l2 += cur_l2;
+
+            if(linf_failures)
+            {
+                // Merge under the lock; the guard releases it even if insert throws.
+                std::lock_guard<std::mutex> guard(linf_failure_lock);
+                linf_failures->insert(linf_failures->end(),
+                                      linf_failures_private.begin(),
+                                      linf_failures_private.end());
+            }
+        }
+    }
+    return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 distance between two buffers of dimension length and
+// with types given by itype, otype, and precision.
+template <typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance(const std::vector<hostbuf>&             input,
+                            const std::vector<hostbuf>&             output,
+                            const Tint1&                            length,
+                            const size_t                            nbatch,
+                            const fft_precision                     precision,
+                            const fft_array_type                    itype,
+                            const Tint2&                            istride,
+                            const size_t                            idist,
+                            const fft_array_type                    otype,
+                            const Tint3&                            ostride,
+                            const size_t                            odist,
+                            std::vector<std::pair<size_t, size_t>>* linf_failures,
+                            const double                            linf_cutoff,
+                            const std::vector<size_t>&              ioffset,
+                            const std::vector<size_t>&              ooffset,
+                            const double                            output_scalar = 1.0)
+{
+    VectorNorms dist;
+
+    if(itype == otype)
+    {
+        switch(itype)
+        {
+        case fft_array_type_complex_interleaved:
+        case fft_array_type_hermitian_interleaved:
+            switch(precision)
+            {
+            case fft_precision_half:
+                dist = distance_1to1_complex(
+                    reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+                    reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
+                    length,
+                    nbatch,
+                    istride,
+                    idist,
+                    ostride,
+                    odist,
+                    linf_failures,
+                    linf_cutoff,
+                    ioffset,
+                    ooffset,
+                    output_scalar);
+                break;
+            case fft_precision_single:
+                dist = distance_1to1_complex(
+                    reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+                    reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
+                    length,
+                    nbatch,
+                    istride,
+                    idist,
+                    ostride,
+                    odist,
+                    linf_failures,
+                    linf_cutoff,
+                    ioffset,
+                    ooffset,
+                    output_scalar);
+                break;
+            case fft_precision_double:
+                dist = distance_1to1_complex(
+                    reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+                    reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
+                    length,
+                    nbatch,
+                    istride,
+                    idist,
+                    ostride,
+                    odist,
+                    linf_failures,
+                    linf_cutoff,
+                    ioffset,
+                    ooffset,
+                    output_scalar);
+                break;
+            }
+            dist.l_2 *= dist.l_2;
+            break;
+        case fft_array_type_real:
+        case fft_array_type_complex_planar:
+        case fft_array_type_hermitian_planar:
+            for(unsigned int idx = 0; idx < input.size(); ++idx)
+            {
+                VectorNorms d;
+                switch(precision)
+                {
+                case fft_precision_half:
+                    d = distance_1to1_real(reinterpret_cast<const _Float16*>(input[idx].data()),
+                                           reinterpret_cast<const _Float16*>(output[idx].data()),
+                                           length,
+                                           nbatch,
+                                           istride,
+                                           idist,
+                                           ostride,
+                                           odist,
+                                           linf_failures,
+                                           linf_cutoff,
+                                           ioffset,
+                                           ooffset,
+                                           output_scalar);
+                    break;
+                case fft_precision_single:
+                    d = distance_1to1_real(reinterpret_cast<const float*>(input[idx].data()),
+                                           reinterpret_cast<const float*>(output[idx].data()),
+                                           length,
+                                           nbatch,
+                                           istride,
+                                           idist,
+                                           ostride,
+                                           odist,
+                                           linf_failures,
+                                           linf_cutoff,
+                                           ioffset,
+                                           ooffset,
+                                           output_scalar);
+                    break;
+                case fft_precision_double:
+                    d = distance_1to1_real(reinterpret_cast<const double*>(input[idx].data()),
+                                           reinterpret_cast<const double*>(output[idx].data()),
+                                           length,
+                                           nbatch,
+                                           istride,
+                                           idist,
+                                           ostride,
+                                           odist,
+                                           linf_failures,
+                                           linf_cutoff,
+                                           ioffset,
+                                           ooffset,
+                                           output_scalar);
+                    break;
+                }
+                dist.l_inf = std::max(d.l_inf, dist.l_inf);
+                dist.l_2 += d.l_2 * d.l_2;
+            }
+            break;
+        default:
+            throw std::runtime_error("Invalid input and output types.");
+        }
+    }
+    else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
+            || (itype == fft_array_type_hermitian_interleaved
+                && otype == fft_array_type_hermitian_planar))
+    {
+        switch(precision)
+        {
+        case fft_precision_half:
+            dist = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+                                 reinterpret_cast<const _Float16*>(output[0].data()),
+                                 reinterpret_cast<const _Float16*>(output[1].data()),
+                                 length,
+                                 nbatch,
+                                 istride,
+                                 idist,
+                                 ostride,
+                                 odist,
+                                 linf_failures,
+                                 linf_cutoff,
+                                 ioffset,
+                                 ooffset,
+                                 output_scalar);
+            break;
+        case fft_precision_single:
+            dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+                                 reinterpret_cast<const float*>(output[0].data()),
+                                 reinterpret_cast<const float*>(output[1].data()),
+                                 length,
+                                 nbatch,
+                                 istride,
+                                 idist,
+                                 ostride,
+                                 odist,
+                                 linf_failures,
+                                 linf_cutoff,
+                                 ioffset,
+                                 ooffset,
+                                 output_scalar);
+            break;
+        case fft_precision_double:
+            dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+                                 reinterpret_cast<const double*>(output[0].data()),
+                                 reinterpret_cast<const double*>(output[1].data()),
+                                 length,
+                                 nbatch,
+                                 istride,
+                                 idist,
+                                 ostride,
+                                 odist,
+                                 linf_failures,
+                                 linf_cutoff,
+                                 ioffset,
+                                 ooffset,
+                                 output_scalar);
+            break;
+        }
+        dist.l_2 *= dist.l_2;
+    }
+    else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
+            || (itype == fft_array_type_hermitian_planar
+                && otype == fft_array_type_hermitian_interleaved))
+    {
+        switch(precision)
+        {
+        case fft_precision_half:
+            dist
+                = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
+                                reinterpret_cast<const _Float16*>(input[0].data()),
+                                reinterpret_cast<const _Float16*>(input[1].data()),
+                                length,
+                                nbatch,
+                                ostride,
+                                odist,
+                                istride,
+                                idist,
+                                linf_failures,
+                                linf_cutoff,
+                                ioffset,
+                                ooffset,
+                                output_scalar);
+            break;
+        case fft_precision_single:
+            dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
+                                 reinterpret_cast<const float*>(input[0].data()),
+                                 reinterpret_cast<const float*>(input[1].data()),
+                                 length,
+                                 nbatch,
+                                 ostride,
+                                 odist,
+                                 istride,
+                                 idist,
+                                 linf_failures,
+                                 linf_cutoff,
+                                 ioffset,
+                                 ooffset,
+                                 output_scalar);
+            break;
+        case fft_precision_double:
+            dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
+                                 reinterpret_cast<const double*>(input[0].data()),
+                                 reinterpret_cast<const double*>(input[1].data()),
+                                 length,
+                                 nbatch,
+                                 ostride,
+                                 odist,
+                                 istride,
+                                 idist,
+                                 linf_failures,
+                                 linf_cutoff,
+                                 ioffset,
+                                 ooffset,
+                                 output_scalar);
+            break;
+        }
+        dist.l_2 *= dist.l_2;
+    }
+    else
+    {
+        throw std::runtime_error("Invalid input and output types.");
+    }
+    dist.l_2 = sqrt(dist.l_2);
+    return dist;
+}
+
+// Reports whether stride is the packed row-major layout of length, with dist as the total extent.
+template <typename Tint1, typename Tint2>
+bool is_contiguous_rowmajor(const std::vector<Tint1>& length,
+                            const std::vector<Tint2>& stride,
+                            size_t                    dist)
+{
+    // Walk back from the last dimension over as many dimensions as both vectors
+    // share; each stride must equal the product of the lengths that follow it.
+    const size_t ndim   = stride.size() < length.size() ? stride.size() : length.size();
+    size_t       packed = 1;
+    for(size_t back = 1; back <= ndim; ++back)
+    {
+        if(stride[stride.size() - back] != packed)
+            return false;
+        packed *= length[length.size() - back];
+    }
+    return packed == dist;
+}
+
+// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions (aborts otherwise)
+template <typename Tint1, typename Tint2, typename Tint3>
+inline VectorNorms distance(const std::vector<hostbuf>&             input,
+                            const std::vector<hostbuf>&             output,
+                            std::vector<Tint1>                      length,
+                            size_t                                  nbatch,
+                            const fft_precision                     precision,
+                            const fft_array_type                    itype,
+                            std::vector<Tint2>                      istride,
+                            const size_t                            idist,
+                            const fft_array_type                    otype,
+                            std::vector<Tint3>                      ostride,
+                            const size_t                            odist,
+                            std::vector<std::pair<size_t, size_t>>* linf_failures,
+                            const double                            linf_cutoff,
+                            const std::vector<size_t>&              ioffset,
+                            const std::vector<size_t>&              ooffset,
+                            const double                            output_scalar = 1.0)
+{
+    // If istride and ostride are both contiguous, collapse all dimensions
+    // and batches into a single 1D problem of product(length) * nbatch
+    // elements.  Index calculation is simpler (and faster) in the 1D case.
+    if(is_contiguous_rowmajor(length, istride, idist)
+       && is_contiguous_rowmajor(length, ostride, odist))
+    {
+        length  = {product(length.begin(), length.end()) * nbatch};
+        istride = {static_cast<Tint2>(1)};
+        ostride = {static_cast<Tint3>(1)};
+        nbatch  = 1; // length/istride/ostride/nbatch are by-value copies, so the caller is unaffected
+    }
+
+    switch(length.size())
+    {
+    case 1:
+        return distance(input,
+                        output,
+                        length[0],
+                        nbatch,
+                        precision,
+                        itype,
+                        istride[0],
+                        idist,
+                        otype,
+                        ostride[0],
+                        odist,
+                        linf_failures,
+                        linf_cutoff,
+                        ioffset,
+                        ooffset,
+                        output_scalar);
+    case 2:
+        return distance(input,
+                        output,
+                        std::make_tuple(length[0], length[1]),
+                        nbatch,
+                        precision,
+                        itype,
+                        std::make_tuple(istride[0], istride[1]),
+                        idist,
+                        otype,
+                        std::make_tuple(ostride[0], ostride[1]),
+                        odist,
+                        linf_failures,
+                        linf_cutoff,
+                        ioffset,
+                        ooffset,
+                        output_scalar);
+    case 3:
+        return distance(input,
+                        output,
+                        std::make_tuple(length[0], length[1], length[2]),
+                        nbatch,
+                        precision,
+                        itype,
+                        std::make_tuple(istride[0], istride[1], istride[2]),
+                        idist,
+                        otype,
+                        std::make_tuple(ostride[0], ostride[1], ostride[2]),
+                        odist,
+                        linf_failures,
+                        linf_cutoff,
+                        ioffset,
+                        ooffset,
+                        output_scalar);
+    default:
+        abort(); // length.size() is not 1, 2 or 3: no specialization to forward to
+    }
+}
+
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and batch
+// distance idist; elements are read at offset[0].  Data is rocfft_complex.
+template <typename Tcomplex, typename T1, typename T2>
+inline VectorNorms norm_complex(const Tcomplex*            input,
+                                const T1&                  whole_length,
+                                const size_t               nbatch,
+                                const T2&                  istride,
+                                const size_t               idist,
+                                const std::vector<size_t>& offset)
+{
+    double linf = 0.0;
+    double l2   = 0.0; // running sum of squares; the square root is taken on return
+
+    size_t idx_base   = 0;
+    auto   partitions = partition_rowmajor(whole_length); // each entry: (.first = start index, .second = length)
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            double     cur_linf = 0.0;
+            double     cur_l2   = 0.0;
+            auto       index    = partitions[part].first;
+            const auto length   = partitions[part].second;
+            do
+            {
+                const auto idx = compute_index(index, istride, idx_base);
+
+                const double rval = std::abs(static_cast<double>(input[idx + offset[0]].real()));
+                cur_linf          = std::max(rval, cur_linf);
+                cur_l2 += rval * rval;
+
+                const double ival = std::abs(static_cast<double>(input[idx + offset[0]].imag()));
+                cur_linf          = std::max(ival, cur_linf);
+                cur_l2 += ival * ival;
+
+            } while(increment_rowmajor(index, length));
+            linf = std::max(linf, cur_linf); // fold this partition's result into the reduction variables
+            l2 += cur_l2;
+        }
+    }
+    return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and batch
+// distance idist; elements are read at offset[0].  Data is real-valued.
+template <typename Tfloat, typename T1, typename T2>
+inline VectorNorms norm_real(const Tfloat*              input,
+                             const T1&                  whole_length,
+                             const size_t               nbatch,
+                             const T2&                  istride,
+                             const size_t               idist,
+                             const std::vector<size_t>& offset)
+{
+    double linf = 0.0;
+    double l2   = 0.0; // running sum of squares; the square root is taken on return
+
+    size_t idx_base   = 0;
+    auto   partitions = partition_rowmajor(whole_length); // each entry: (.first = start index, .second = length)
+    for(size_t b = 0; b < nbatch; b++, idx_base += idist)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
+#endif
+        for(size_t part = 0; part < partitions.size(); ++part)
+        {
+            double     cur_linf = 0.0;
+            double     cur_l2   = 0.0;
+            auto       index    = partitions[part].first;
+            const auto length   = partitions[part].second;
+            do
+            {
+                const auto   idx = compute_index(index, istride, idx_base);
+                const double val = std::abs(static_cast<double>(input[idx + offset[0]]));
+                cur_linf         = std::max(val, cur_linf);
+                cur_l2 += val * val;
+
+            } while(increment_rowmajor(index, length));
+            linf = std::max(linf, cur_linf); // fold this partition's result into the reduction variables
+            l2 += cur_l2;
+        }
+    }
+    return {.l_2 = sqrt(l2), .l_inf = linf};
+}
+
+// Compute the L-infinity and L-2 norm of a buffer with strides istride and
+// batch distance idist.  Data format is given by precision and itype.
+template <typename T1, typename T2>
+inline VectorNorms norm(const std::vector<hostbuf>& input,
+                        const T1&                   length,
+                        const size_t                nbatch,
+                        const fft_precision         precision,
+                        const fft_array_type        itype,
+                        const T2&                   istride,
+                        const size_t                idist,
+                        const std::vector<size_t>&  offset)
+{
+    VectorNorms norm; // NOTE(review): the real/planar path accumulates into this; presumably VectorNorms zero-initializes its members -- confirm
+
+    switch(itype)
+    {
+    case fft_array_type_complex_interleaved:
+    case fft_array_type_hermitian_interleaved:
+        switch(precision)
+        {
+        case fft_precision_half:
+            norm = norm_complex(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
+                                length,
+                                nbatch,
+                                istride,
+                                idist,
+                                offset);
+            break;
+        case fft_precision_single:
+            norm = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
+                                length,
+                                nbatch,
+                                istride,
+                                idist,
+                                offset);
+            break;
+        case fft_precision_double:
+            norm = norm_complex(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
+                                length,
+                                nbatch,
+                                istride,
+                                idist,
+                                offset);
+            break;
+        }
+        norm.l_2 *= norm.l_2; // back to a sum of squares so the common sqrt at the end applies to every path
+        break;
+    case fft_array_type_real:
+    case fft_array_type_complex_planar:
+    case fft_array_type_hermitian_planar:
+        for(unsigned int idx = 0; idx < input.size(); ++idx) // one pass per buffer in input
+        {
+            VectorNorms n;
+            switch(precision)
+            {
+            case fft_precision_half:
+                n = norm_real(reinterpret_cast<const _Float16*>(input[idx].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              offset);
+                break;
+            case fft_precision_single:
+                n = norm_real(reinterpret_cast<const float*>(input[idx].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              offset);
+                break;
+            case fft_precision_double:
+                n = norm_real(reinterpret_cast<const double*>(input[idx].data()),
+                              length,
+                              nbatch,
+                              istride,
+                              idist,
+                              offset);
+                break;
+            }
+            norm.l_inf = std::max(n.l_inf, norm.l_inf);
+            norm.l_2 += n.l_2 * n.l_2; // accumulate the squared per-buffer L-2 norms
+        }
+        break;
+    default:
+        throw std::runtime_error("Invalid data type");
+    }
+
+    norm.l_2 = sqrt(norm.l_2);
+    return norm;
+}
+
+// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions (aborts otherwise)
+template <typename T1, typename T2>
+inline VectorNorms norm(const std::vector<hostbuf>& input,
+                        std::vector<T1>             length,
+                        size_t                      nbatch,
+                        const fft_precision         precision,
+                        const fft_array_type        type,
+                        std::vector<T2>             stride,
+                        const size_t                dist,
+                        const std::vector<size_t>&  offset)
+{
+    // If stride is contiguous, collapse all dimensions and batches into a single
+    // 1D problem.  Index calculation is simpler (and faster) in the 1D case.
+    if(is_contiguous_rowmajor(length, stride, dist))
+    {
+        length = {product(length.begin(), length.end()) * nbatch};
+        stride = {static_cast<T2>(1)};
+        nbatch = 1; // length/stride/nbatch are by-value copies, so the caller is unaffected
+    }
+
+    switch(length.size())
+    {
+    case 1:
+        return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset);
+    case 2:
+        return norm(input,
+                    std::make_tuple(length[0], length[1]),
+                    nbatch,
+                    precision,
+                    type,
+                    std::make_tuple(stride[0], stride[1]),
+                    dist,
+                    offset);
+    case 3:
+        return norm(input,
+                    std::make_tuple(length[0], length[1], length[2]),
+                    nbatch,
+                    precision,
+                    type,
+                    std::make_tuple(stride[0], stride[1], stride[2]),
+                    dist,
+                    offset);
+    default:
+        abort(); // length.size() is not 1, 2 or 3: no specialization to forward to
+    }
+}
+
+// Allocate one host buffer per entry of size; buffer i is allocated with
+// size[i] * var_size<size_t>(precision, type) as its size.
+static std::vector<hostbuf> allocate_host_buffer(const fft_precision        precision,
+                                                 const fft_array_type       type,
+                                                 const std::vector<size_t>& size)
+{
+    std::vector<hostbuf> buffers(size.size());
+    // buffers and size have the same number of entries, so walk them in lockstep
+    auto count = size.begin();
+    for(auto& buffer : buffers)
+        buffer.alloc((*count++) * var_size<size_t>(precision, type));
+    return buffers;
+}
+
+// Check if the required buffers fit in the device vram (deviceId is currently unused).
+inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0)
+{
+    // We keep a small margin of error (1 << 27) for fitting the problem into vram.
+    const size_t extra = 1 << 27;
+    // Take the margin off vram_avail: prob_size + extra could wrap around size_t and report a fit.
+    return vram_avail > extra && prob_size < vram_avail - extra;
+}
+
+// Estimate the VRAM taken by the real/complex even-length twiddle table of
+// an r2c/c2r transform.  Every other transform type reports 0, since the
+// VRAM footprint in rocFFT is negligible for the other cases.
+inline size_t twiddle_table_vram_footprint(const fft_params& params)
+{
+    const bool real_transform = params.transform_type == fft_transform_type_real_forward
+                                || params.transform_type == fft_transform_type_real_inverse;
+    if(!real_transform)
+        return 0;
+
+    // Only an even last entry of params.length contributes a table.
+    const auto realdim = params.length.back();
+    if(realdim % 2 != 0)
+        return 0;
+
+    // NOTE(review): every precision other than single is counted as 16 per
+    // complex element, half included -- confirm that is intended.
+    const auto complex_size = params.precision == fft_precision_single ? 8 : 16;
+
+    // even length twiddle size is 1/4 of the real size, but in complex elements
+    size_t vram_footprint = realdim * complex_size / 4;
+    return vram_footprint;
+}
+
+#endif
diff -Nru rocfft-5.5.0/shared/hipstream_wrapper.h rocfft-5.7.1/shared/hipstream_wrapper.h
--- rocfft-5.5.0/shared/hipstream_wrapper.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/hipstream_wrapper.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,65 @@
+/******************************************************************************
+* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef ROCFFT_HIPSTREAM_WRAPPER_H
+#define ROCFFT_HIPSTREAM_WRAPPER_H
+
+#include "rocfft_hip.h"
+
+// RAII wrapper around hipStream_t: the stream is created by alloc() and destroyed with the wrapper
+struct hipStream_wrapper_t
+{
+    hipStream_wrapper_t() {} // starts out without a stream
+    // Not copyable; move construction hands the stream over and empties the source.
+    hipStream_wrapper_t(const hipStream_wrapper_t&) = delete;
+    hipStream_wrapper_t& operator=(const hipStream_wrapper_t&) = delete;
+    hipStream_wrapper_t(hipStream_wrapper_t&& other)
+        : stream(other.stream)
+    {
+        other.stream = nullptr;
+    }
+
+    ~hipStream_wrapper_t()
+    {
+        if(stream != nullptr)
+            (void)hipStreamDestroy(stream);
+    }
+
+    // Create the stream unless one is already held; throws if creation fails.
+    void alloc()
+    {
+        if(stream != nullptr)
+            return;
+        if(hipStreamCreate(&stream) != hipSuccess)
+            throw std::runtime_error("hipStreamCreate failure");
+    }
+
+    operator hipStream_t() // yields the raw handle, nullptr before alloc()
+    {
+        return stream;
+    }
+
+private:
+    hipStream_t stream = nullptr;
+};
+
+#endif // ROCFFT_HIPSTREAM_WRAPPER_H
\ No newline at end of file
diff -Nru rocfft-5.5.0/shared/hostbuf.h rocfft-5.7.1/shared/hostbuf.h
--- rocfft-5.5.0/shared/hostbuf.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/hostbuf.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,150 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_HOSTBUF_H
+#define ROCFFT_HOSTBUF_H
+
+#include "arithmetic.h"
+#include <cstdlib>
+
+#ifndef WIN32
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
+// Simple RAII class for host buffers.  T is the type of pointer that
+// data() returns
+template <class T = void>
+class hostbuf_t
+{
+public:
+    hostbuf_t() {} // starts empty: buf is nullptr and bsize is 0
+    // buffers are movable but not copyable; both moves are implemented as swaps
+    hostbuf_t(hostbuf_t&& other)
+    {
+        std::swap(buf, other.buf);
+        std::swap(bsize, other.bsize);
+    }
+    hostbuf_t& operator=(hostbuf_t&& other)
+    {
+        std::swap(buf, other.buf); // other now holds (and will later free) our previous buffer
+        std::swap(bsize, other.bsize);
+        return *this;
+    }
+    hostbuf_t(const hostbuf_t&) = delete;
+    hostbuf_t& operator=(const hostbuf_t&) = delete;
+
+    ~hostbuf_t()
+    {
+        free();
+    }
+
+    void alloc(const size_t size) // (re)allocate size bytes, discarding any previous buffer
+    {
+        free(); // must come first: free() resets bsize to 0 whenever a buffer is held
+        bsize = size;
+
+        // FFTW requires aligned allocations to use faster SIMD instructions.
+        // If enabling hugepages, align to 2 MiB. Otherwise, aligning to
+        // 64 bytes is enough for AVX instructions up to AVX512.
+#ifdef WIN32
+        buf = _aligned_malloc(size, 64);
+#else
+        // On Linux, ask for hugepages to reduce TLB pressure and
+        // improve performance.  Allocations need to be aligned to
+        // the hugepage size, and rounded up to the next whole
+        // hugepage.
+        static const size_t TWO_MiB = 2 * 1024 * 1024;
+        if(size >= TWO_MiB)
+        {
+            size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB;
+            buf                 = aligned_alloc(TWO_MiB, rounded_size);
+            madvise(buf, rounded_size, MADV_HUGEPAGE); // advisory only; the return value is ignored
+        }
+        else
+            buf = aligned_alloc(64, size); // NOTE(review): allocation results are not checked for nullptr
+#endif
+    }
+
+    size_t size() const // size set by the last alloc() or shrink(); 0 after free()
+    {
+        return bsize;
+    }
+
+    void free() // release the buffer, if any, and reset to the empty state
+    {
+        if(buf != nullptr)
+        {
+#ifdef WIN32
+            _aligned_free(buf);
+#else
+            std::free(buf);
+#endif
+            buf   = nullptr;
+            bsize = 0;
+        }
+    }
+
+    T* data() const
+    {
+        return static_cast<T*>(buf);
+    }
+
+    // Copy method: returns a new buffer of bsize bytes holding a memcpy of this one
+    hostbuf_t copy() const
+    {
+        hostbuf_t copy;
+        copy.alloc(bsize);
+        memcpy(copy.buf, buf, bsize);
+        return copy;
+    }
+
+    // shrink the buffer to fit the new size
+    void shrink(size_t new_size)
+    {
+        if(new_size > bsize)
+            throw std::runtime_error("can't shrink hostbuf to larger size");
+        // just pretend the buffer is now that size
+        bsize = new_size;
+    }
+
+    // equality/bool tests
+    bool operator==(std::nullptr_t n) const
+    {
+        return buf == n;
+    }
+    bool operator!=(std::nullptr_t n) const
+    {
+        return buf != n;
+    }
+    operator bool() const
+    {
+        return buf;
+    }
+
+private:
+    // The host buffer
+    void*  buf   = nullptr;
+    size_t bsize = 0;
+};
+
+// default hostbuf that gives out void* pointers
+typedef hostbuf_t<> hostbuf;
+#endif
diff -Nru rocfft-5.5.0/shared/precision_type.h rocfft-5.7.1/shared/precision_type.h
--- rocfft-5.5.0/shared/precision_type.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/precision_type.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,68 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_PRECISION_TYPE_H
+#define ROCFFT_PRECISION_TYPE_H
+
+#include "rocfft.h"
+#include <stdexcept>
+
+// Size in bytes of one real element stored with the given precision.
+// Throws std::runtime_error for a value that matches none of the enumerators.
+static size_t real_type_size(rocfft_precision precision)
+{
+    switch(precision)
+    {
+    case rocfft_precision_half:
+        return 2;
+    case rocfft_precision_single:
+        return 4;
+    case rocfft_precision_double:
+        return 8;
+    }
+    // Deliberately no default label, so compilers can still warn about an
+    // unhandled enumerator; falling off the end of the function would be
+    // undefined behavior.
+    throw std::runtime_error("Invalid precision");
+}
+
+// Size in bytes of one complex element: a real part plus an imaginary part.
+static size_t complex_type_size(rocfft_precision precision)
+{
+    return real_type_size(precision) * 2;
+}
+
+// Name of the precision as a string literal ("half", "single" or "double").
+// Throws std::runtime_error for a value that matches none of the enumerators.
+static const char* precision_name(rocfft_precision precision)
+{
+    switch(precision)
+    {
+    case rocfft_precision_half:
+        return "half";
+    case rocfft_precision_single:
+        return "single";
+    case rocfft_precision_double:
+        return "double";
+    }
+    // As above: no default label, but never fall off the end.
+    throw std::runtime_error("Invalid precision");
+}
+#endif
diff -Nru rocfft-5.5.0/shared/printbuffer.h rocfft-5.7.1/shared/printbuffer.h
--- rocfft-5.5.0/shared/printbuffer.h	2023-01-31 06:20:16.000000000 +0000
+++ rocfft-5.7.1/shared/printbuffer.h	2023-08-09 16:19:51.000000000 +0000
@@ -1,4 +1,4 @@
-// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
 #ifndef PRINTBUFFER_H
 #define PRINTBUFFER_H
 
+#include "hostbuf.h"
 #include "increment.h"
 #include <algorithm>
 #include <vector>
@@ -62,69 +63,40 @@
     }
 }
 
-// Partial template specializations for printing different buffer types.
-template <typename Toutput>
+template <typename Telem>
 class buffer_printer
 {
-public:
-    template <typename Tallocator,
-              typename Tint1,
-              typename Tint2,
-              typename Tsize,
-              typename Tstream = std::ostream>
-    void print_buffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                      const std::vector<Tint1>&                         length,
-                      const std::vector<Tint2>&                         stride,
-                      const Tsize                                       nbatch,
-                      const Tsize                                       dist,
-                      const std::vector<size_t>&                        offset,
-                      Tstream&                                          stream = std::cout)
-    {
-        throw std::runtime_error("base class for buffer_printer print_buffer not implemented.");
-    };
-    template <typename Tallocator, typename Tstream = std::ostream>
-    void print_buffer_flat(const std::vector<std::vector<char, Tallocator>>& buf,
-                           const std::vector<size_t>&                        size,
-                           const std::vector<size_t>&                        offset)
-    {
-        throw std::runtime_error(
-            "base class for buffer_printer print_buffer_flat not implemented.");
-    };
-};
-
-template <>
-class buffer_printer<float>
-{
     // The scalar versions might be part of a planar format.
 public:
-    template <typename Tallocator,
-              typename Tint1,
-              typename Tint2,
-              typename Tsize,
-              typename Tstream = std::ostream>
-    static void print_buffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                             const std::vector<Tint1>&                         length,
-                             const std::vector<Tint2>&                         stride,
-                             const Tsize                                       nbatch,
-                             const Tsize                                       dist,
-                             const std::vector<size_t>&                        offset,
-                             Tstream&                                          stream = std::cout)
+    template <typename Tint1, typename Tint2, typename Tsize, typename Tstream = std::ostream>
+    static void print_buffer(const std::vector<hostbuf>& buf,
+                             const std::vector<Tint1>&   length,
+                             const std::vector<Tint2>&   stride,
+                             const Tsize                 nbatch,
+                             const Tsize                 dist,
+                             const std::vector<size_t>&  offset,
+                             Tstream&                    stream = std::cout)
     {
         for(const auto& vec : buf)
         {
-            printbuffer(
-                (const float*)(vec.data()), length, stride, nbatch, dist, offset[0], stream);
+            printbuffer(reinterpret_cast<const Telem*>(vec.data()),
+                        length,
+                        stride,
+                        nbatch,
+                        dist,
+                        offset[0],
+                        stream);
         }
     };
-    template <typename Tallocator, typename Tstream = std::ostream>
-    static void print_buffer_flat(const std::vector<std::vector<char, Tallocator>>& buf,
-                                  const std::vector<size_t>&                        size,
-                                  const std::vector<size_t>&                        offset,
-                                  Tstream& stream = std::cout)
+    template <typename Tstream = std::ostream>
+    static void print_buffer_flat(const std::vector<hostbuf>& buf,
+                                  const std::vector<size_t>&  size,
+                                  const std::vector<size_t>&  offset,
+                                  Tstream&                    stream = std::cout)
     {
         for(const auto& vec : buf)
         {
-            auto data = reinterpret_cast<const float*>(vec.data());
+            auto data = reinterpret_cast<const Telem*>(vec.data());
             stream << "idx " << 0;
             for(size_t i = 0; i < size[0]; ++i)
                 stream << " " << data[i];
@@ -133,121 +105,4 @@
     };
 };
 
-template <>
-class buffer_printer<double>
-{
-    // The scalar versions might be part of a planar format.
-public:
-    template <typename Tallocator,
-              typename Tint1,
-              typename Tint2,
-              typename Tsize,
-              typename Tstream = std::ostream>
-    static void print_buffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                             const std::vector<Tint1>&                         length,
-                             const std::vector<Tint2>&                         stride,
-                             const Tsize                                       nbatch,
-                             const Tsize                                       dist,
-                             const std::vector<size_t>&                        offset,
-                             Tstream&                                          stream = std::cout)
-    {
-        for(const auto& vec : buf)
-        {
-            printbuffer(
-                (const double*)(vec.data()), length, stride, nbatch, dist, offset[0], stream);
-        }
-    };
-    template <typename Tallocator, typename Tstream = std::ostream>
-    static void print_buffer_flat(const std::vector<std::vector<char, Tallocator>>& buf,
-                                  const std::vector<size_t>&                        size,
-                                  const std::vector<size_t>&                        offset,
-                                  Tstream& stream = std::cout)
-    {
-        for(const auto& vec : buf)
-        {
-            auto data = reinterpret_cast<const double*>(vec.data());
-            stream << "idx " << 0;
-            for(size_t i = 0; i < size[0]; ++i)
-                stream << " " << data[i];
-            stream << std::endl;
-        }
-    };
-};
-
-template <>
-class buffer_printer<std::complex<float>>
-{
-public:
-    template <typename Tallocator,
-              typename Tint1,
-              typename Tint2,
-              typename Tsize,
-              typename Tstream = std::ostream>
-    static void print_buffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                             const std::vector<Tint1>&                         length,
-                             const std::vector<Tint2>&                         stride,
-                             const Tsize                                       nbatch,
-                             const Tsize                                       dist,
-                             const std::vector<size_t>&                        offset,
-                             Tstream&                                          stream = std::cout)
-    {
-        printbuffer((const std::complex<float>*)(buf[0].data()),
-                    length,
-                    stride,
-                    nbatch,
-                    dist,
-                    offset[0],
-                    stream);
-    };
-    template <typename Tallocator, typename Tstream = std::ostream>
-    static void print_buffer_flat(const std::vector<std::vector<char, Tallocator>>& buf,
-                                  const std::vector<size_t>&                        size,
-                                  const std::vector<size_t>&                        offset,
-                                  Tstream& stream = std::cout)
-    {
-        auto data = reinterpret_cast<const std::complex<float>*>(buf[0].data());
-        for(size_t i = 0; i < size[0]; ++i)
-            stream << " " << data[i];
-        stream << std::endl;
-    };
-};
-
-template <>
-class buffer_printer<std::complex<double>>
-{
-public:
-    template <typename Tallocator,
-              typename Tint1,
-              typename Tint2,
-              typename Tsize,
-              typename Tstream = std::ostream>
-    static void print_buffer(const std::vector<std::vector<char, Tallocator>>& buf,
-                             const std::vector<Tint1>&                         length,
-                             const std::vector<Tint2>&                         stride,
-                             const Tsize                                       nbatch,
-                             const Tsize                                       dist,
-                             const std::vector<size_t>&                        offset,
-                             Tstream&                                          stream = std::cout)
-    {
-        printbuffer((const std::complex<double>*)(buf[0].data()),
-                    length,
-                    stride,
-                    nbatch,
-                    dist,
-                    offset[0],
-                    stream);
-    };
-    template <typename Tallocator, typename Tstream = std::ostream>
-    static void print_buffer_flat(const std::vector<std::vector<char, Tallocator>>& buf,
-                                  const std::vector<size_t>&                        size,
-                                  const std::vector<size_t>&                        offset,
-                                  Tstream& stream = std::cout)
-    {
-        auto data = reinterpret_cast<const std::complex<double>*>(buf[0].data());
-        for(size_t i = 0; i < size[0]; ++i)
-            stream << " " << data[i];
-        stream << std::endl;
-    };
-};
-
 #endif
diff -Nru rocfft-5.5.0/shared/rocfft_complex.h rocfft-5.7.1/shared/rocfft_complex.h
--- rocfft-5.5.0/shared/rocfft_complex.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/rocfft_complex.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,346 @@
+// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_COMPLEX_H
+#define ROCFFT_COMPLEX_H
+
+#include <hip/hip_fp16.h>
+#if !defined(__HIPCC_RTC__)
+#include <iostream>
+#endif
+#include <math.h>
+#include <type_traits>
+
+#ifdef __HIP_PLATFORM_NVIDIA__
+typedef __half _Float16;
+#endif
+
+template <typename Treal>
+struct rocfft_complex
+{
+    // Complex number held as two public Treal members, with host and device operators.
+    Treal x; // Real part
+    Treal y; // Imaginary part
+
+    // Constructors
+    // Do not initialize the members x or y by default, to ensure that it can
+    // be used in __shared__ and that it is a trivial class compatible with C.
+    __device__ __host__ rocfft_complex()                      = default;
+    __device__ __host__ rocfft_complex(const rocfft_complex&) = default;
+    __device__ __host__ rocfft_complex(rocfft_complex&&)      = default;
+    __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default;
+    __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default;
+    __device__                          __host__ ~rocfft_complex()        = default;
+
+    // Constructor from real and imaginary parts
+    __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag)
+        : x{real}
+        , y{imag}
+    {
+    }
+
+    // Conversion from different precision
+    template <typename U>
+    __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z)
+        : x(z.x)
+        , y(z.y)
+    {
+    }
+
+    // Accessors
+    __device__ __host__ constexpr Treal real() const
+    {
+        return x;
+    }
+
+    __device__ __host__ constexpr Treal imag() const
+    {
+        return y;
+    }
+
+    // Unary operations
+    __forceinline__ __device__ __host__ rocfft_complex operator-() const
+    {
+        return {-x, -y};
+    }
+
+    __forceinline__ __device__ __host__ rocfft_complex operator+() const
+    {
+        return *this;
+    }
+
+    __device__ __host__ Treal asum(const rocfft_complex& z)
+    {
+        return abs(z.x) + abs(z.y);
+    }
+
+    // Internal real functions (static helpers operating on scalars, not on the complex value)
+    static __forceinline__ __device__ __host__ Treal abs(Treal x)
+    {
+        return x < 0 ? -x : x;
+    }
+
+    static __forceinline__ __device__ __host__ float sqrt(float x)
+    {
+        return ::sqrtf(x);
+    }
+
+    static __forceinline__ __device__ __host__ double sqrt(double x)
+    {
+        return ::sqrt(x);
+    }
+
+    // Addition operators
+    __device__ __host__ auto& operator+=(const rocfft_complex& rhs)
+    {
+        return *this = {x + rhs.x, y + rhs.y};
+    }
+
+    __device__ __host__ auto operator+(const rocfft_complex& rhs) const
+    {
+        auto lhs = *this;
+        return lhs += rhs;
+    }
+
+    // Subtraction operators
+    __device__ __host__ auto& operator-=(const rocfft_complex& rhs)
+    {
+        return *this = {x - rhs.x, y - rhs.y};
+    }
+
+    __device__ __host__ auto operator-(const rocfft_complex& rhs) const
+    {
+        auto lhs = *this;
+        return lhs -= rhs;
+    }
+
+    // Multiplication operators
+    __device__ __host__ auto& operator*=(const rocfft_complex& rhs)
+    {
+        return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y};
+    }
+
+    __device__ __host__ auto operator*(const rocfft_complex& rhs) const
+    {
+        auto lhs = *this;
+        return lhs *= rhs;
+    }
+
+    // Division operators
+    __device__ __host__ auto& operator/=(const rocfft_complex& rhs)
+    {
+        // Form of Robert L. Smith's Algorithm 116
+        if(abs(rhs.x) > abs(rhs.y))
+        {
+            Treal ratio = rhs.y / rhs.x;
+            Treal scale = 1 / (rhs.x + rhs.y * ratio);
+            *this       = {(x + y * ratio) * scale, (y - x * ratio) * scale};
+        }
+        else
+        {
+            Treal ratio = rhs.x / rhs.y;
+            Treal scale = 1 / (rhs.x * ratio + rhs.y);
+            *this       = {(y + x * ratio) * scale, (y * ratio - x) * scale};
+        }
+        return *this;
+    }
+
+    __device__ __host__ auto operator/(const rocfft_complex& rhs) const
+    {
+        auto lhs = *this;
+        return lhs /= rhs;
+    }
+
+    // Comparison operators
+    __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const
+    {
+        return x == rhs.x && y == rhs.y;
+    }
+
+    __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    // Operators for complex-real computations: rhs is converted to Treal and treated as purely real
+    template <typename U>
+    __device__ __host__ auto& operator+=(const U& rhs)
+    {
+        return (x += Treal(rhs)), *this;
+    }
+
+    template <typename U>
+    __device__ __host__ auto& operator-=(const U& rhs)
+    {
+        return (x -= Treal(rhs)), *this;
+    }
+
+    __device__ __host__ auto operator+(const Treal& rhs) const
+    {
+        auto lhs = *this;
+        return lhs += rhs;
+    }
+
+    __device__ __host__ auto operator-(const Treal& rhs) const
+    {
+        auto lhs = *this;
+        return lhs -= rhs;
+    }
+
+    template <typename U>
+    __device__ __host__ auto& operator*=(const U& rhs)
+    {
+        return (x *= Treal(rhs)), (y *= Treal(rhs)), *this;
+    }
+
+    template <typename U>
+    __device__ __host__ auto operator*(const U& rhs) const
+    {
+        auto lhs = *this;
+        return lhs *= Treal(rhs);
+    }
+
+    template <typename U>
+    __device__ __host__ auto& operator/=(const U& rhs)
+    {
+        return (x /= Treal(rhs)), (y /= Treal(rhs)), *this;
+    }
+
+    template <typename U>
+    __device__ __host__ auto operator/(const U& rhs) const
+    {
+        auto lhs = *this;
+        return lhs /= Treal(rhs);
+    }
+
+    template <typename U>
+    __device__ __host__ constexpr bool operator==(const U& rhs) const
+    {
+        return x == Treal(rhs) && y == 0;
+    }
+
+    template <typename U>
+    __device__ __host__ constexpr bool operator!=(const U& rhs) const
+    {
+        return !(*this == rhs);
+    }
+};
+
+// Stream operators (compiled only when __HIPCC_RTC__ is not defined, like the <iostream> include above)
+#if !defined(__HIPCC_RTC__)
+static std::ostream& operator<<(std::ostream& stream, const _Float16& f)
+{
+    return stream << static_cast<double>(f);
+}
+
+template <typename Treal>
+std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z)
+{
+    return out << '(' << static_cast<double>(z.x) << ',' << static_cast<double>(z.y) << ')';
+}
+#endif
+
+// Operators for real-complex computations: lhs is converted to Treal and treated as purely real
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return {Treal(lhs) + rhs.x, rhs.y};
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return {Treal(lhs) - rhs.x, -rhs.y}; // (lhs + 0i) - rhs: the imaginary part is just negated
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y}; // real factor scales both components
+}
+
+template <typename U, typename Treal>
+__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    // Form of Robert L. Smith's Algorithm 116: ratio never divides by the smaller-magnitude part of rhs
+    if(rocfft_complex<Treal>::abs(rhs.x) > rocfft_complex<Treal>::abs(rhs.y))
+    {
+        Treal ratio = rhs.y / rhs.x;
+        Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio);
+        return {scale, -scale * ratio}; // in exact arithmetic: lhs * conj(rhs) / norm(rhs)
+    }
+    else
+    {
+        Treal ratio = rhs.x / rhs.y;
+        Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y);
+        return {ratio * scale, -scale}; // same quotient, factored through rhs.y instead of rhs.x
+    }
+}
+
+template <typename U, typename Treal>
+__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return Treal(lhs) == rhs.x && 0 == rhs.y; // equal only when rhs has a zero imaginary part
+}
+
+template <typename U, typename Treal>
+__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs)
+{
+    return !(lhs == rhs); // negation of whichever operator== overload resolution picks for (lhs, rhs)
+}
+
+// Extending std namespace to handle rocfft_complex datatype
+namespace std // NOTE(review): the C++ standard restricts adding overloads to namespace std - confirm this is intended
+{
+    template <typename Treal>
+    __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z)
+    {
+        return z.x;
+    }
+
+    template <typename Treal>
+    __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z)
+    {
+        return z.y;
+    }
+
+    template <typename Treal>
+    __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z)
+    {
+        return {z.x, -z.y};
+    }
+
+    template <typename Treal>
+    __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z)
+    {
+        return (z.x * z.x) + (z.y * z.y); // squared magnitude; no square root is taken
+    }
+    // Magnitude of z: the smaller of |x|, |y| is divided by the larger before squaring; 0 when both are zero
+    template <typename Treal>
+    __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z)
+    {
+        Treal tr = rocfft_complex<Treal>::abs(z.x), ti = rocfft_complex<Treal>::abs(z.y);
+        return tr > ti ? (ti /= tr, tr * rocfft_complex<Treal>::sqrt(ti * ti + 1))
+               : ti    ? (tr /= ti, ti * rocfft_complex<Treal>::sqrt(tr * tr + 1))
+                       : 0;
+    }
+}
+
+#endif // ROCFFT_COMPLEX_H
diff -Nru rocfft-5.5.0/shared/rocfft_hip.h rocfft-5.7.1/shared/rocfft_hip.h
--- rocfft-5.5.0/shared/rocfft_hip.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/rocfft_hip.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,26 @@
+// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef __ROCFFT_HIP_H__
+#define __ROCFFT_HIP_H__
+
+#include <hip/hip_runtime_api.h>
+
+#endif // __ROCFFT_HIP_H__
diff -Nru rocfft-5.5.0/shared/rocfft_params.h rocfft-5.7.1/shared/rocfft_params.h
--- rocfft-5.5.0/shared/rocfft_params.h	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/shared/rocfft_params.h	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,344 @@
+// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCFFT_PARAMS_H
+#define ROCFFT_PARAMS_H
+
+#include "../shared/fft_params.h"
+#include "../shared/gpubuf.h"
+#include "rocfft.h"
+
+// Return the enumerator name of a rocfft_status code as a string; throws std::runtime_error for unlisted values
+static std::string rocfft_status_to_string(const rocfft_status ret)
+{
+    switch(ret)
+    {
+    case rocfft_status_success:
+        return "rocfft_status_success";
+    case rocfft_status_failure:
+        return "rocfft_status_failure";
+    case rocfft_status_invalid_arg_value:
+        return "rocfft_status_invalid_arg_value";
+    case rocfft_status_invalid_dimensions:
+        return "rocfft_status_invalid_dimensions";
+    case rocfft_status_invalid_array_type:
+        return "rocfft_status_invalid_array_type";
+    case rocfft_status_invalid_strides:
+        return "rocfft_status_invalid_strides";
+    case rocfft_status_invalid_distance:
+        return "rocfft_status_invalid_distance";
+    case rocfft_status_invalid_offset:
+        return "rocfft_status_invalid_offset";
+    case rocfft_status_invalid_work_buffer:
+        return "rocfft_status_invalid_work_buffer";
+    default:
+        throw std::runtime_error("unknown rocfft_status");
+    }
+}
+
+inline fft_status fft_status_from_rocfftparams(const rocfft_status val)
+{
+    switch(val) // each rocfft_status maps to the fft_status with the same suffix
+    {
+    case rocfft_status_success:
+        return fft_status_success;
+    case rocfft_status_failure:
+        return fft_status_failure;
+    case rocfft_status_invalid_arg_value:
+        return fft_status_invalid_arg_value;
+    case rocfft_status_invalid_dimensions:
+        return fft_status_invalid_dimensions;
+    case rocfft_status_invalid_array_type:
+        return fft_status_invalid_array_type;
+    case rocfft_status_invalid_strides:
+        return fft_status_invalid_strides;
+    case rocfft_status_invalid_distance:
+        return fft_status_invalid_distance;
+    case rocfft_status_invalid_offset:
+        return fft_status_invalid_offset;
+    case rocfft_status_invalid_work_buffer:
+        return fft_status_invalid_work_buffer;
+    default: // any value not listed above is reported by exception rather than mapped
+        throw std::runtime_error("Invalid status");
+    }
+}
+
+inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val)
+{
+    switch(val) // each fft_precision maps to the rocfft_precision with the same suffix
+    {
+    case fft_precision_single:
+        return rocfft_precision_single;
+    case fft_precision_double:
+        return rocfft_precision_double;
+    case fft_precision_half:
+        return rocfft_precision_half;
+    default: // any value not listed above is reported by exception rather than mapped
+        throw std::runtime_error("Invalid precision");
+    }
+}
+
+inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val)
+{
+    switch(val) // each fft_array_type maps to the rocfft_array_type with the same suffix
+    {
+    case fft_array_type_complex_interleaved:
+        return rocfft_array_type_complex_interleaved;
+    case fft_array_type_complex_planar:
+        return rocfft_array_type_complex_planar;
+    case fft_array_type_real:
+        return rocfft_array_type_real;
+    case fft_array_type_hermitian_interleaved:
+        return rocfft_array_type_hermitian_interleaved;
+    case fft_array_type_hermitian_planar:
+        return rocfft_array_type_hermitian_planar;
+    case fft_array_type_unset:
+        return rocfft_array_type_unset;
+    }
+    return rocfft_array_type_unset; // fallback for a value matching no case; unlike the sibling converters, no throw
+}
+
+inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val)
+{
+    switch(val) // each fft_transform_type maps to the rocfft_transform_type with the same suffix
+    {
+    case fft_transform_type_complex_forward:
+        return rocfft_transform_type_complex_forward;
+    case fft_transform_type_complex_inverse:
+        return rocfft_transform_type_complex_inverse;
+    case fft_transform_type_real_forward:
+        return rocfft_transform_type_real_forward;
+    case fft_transform_type_real_inverse:
+        return rocfft_transform_type_real_inverse;
+    default: // any value not listed above is reported by exception rather than mapped
+        throw std::runtime_error("Invalid transform type");
+    }
+}
+
+inline rocfft_result_placement
+    rocfft_result_placement_from_fftparams(const fft_result_placement val)
+{
+    switch(val) // each fft_placement value maps to the rocfft_placement value with the same suffix
+    {
+    case fft_placement_inplace:
+        return rocfft_placement_inplace;
+    case fft_placement_notinplace:
+        return rocfft_placement_notinplace;
+    default: // any value not listed above is reported by exception rather than mapped
+        throw std::runtime_error("Invalid result placement");
+    }
+}
+
+class rocfft_params : public fft_params
+{
+public:
+    rocfft_plan             plan = nullptr;
+    rocfft_execution_info   info = nullptr;
+    rocfft_plan_description desc = nullptr;
+    gpubuf_t<void>          wbuffer;
+
+    explicit rocfft_params(){};
+
+    explicit rocfft_params(const fft_params& p)
+        : fft_params(p){};
+
+    rocfft_params(const rocfft_params&) = delete;
+    rocfft_params& operator=(const rocfft_params&) = delete;
+
+    ~rocfft_params()
+    {
+        free();
+    };
+
+    void free()
+    {
+        if(plan != nullptr) // each handle is destroyed only if set, then reset so a repeated call skips it
+        {
+            rocfft_plan_destroy(plan);
+            plan = nullptr;
+        }
+        if(info != nullptr)
+        {
+            rocfft_execution_info_destroy(info);
+            info = nullptr;
+        }
+        if(desc != nullptr)
+        {
+            rocfft_plan_description_destroy(desc);
+            desc = nullptr;
+        }
+        wbuffer.free(); // NOTE(review): presumably releases the device work buffer - confirm in gpubuf.h
+    }
+
+    rocfft_precision get_rocfft_precision()
+    {
+        return rocfft_precision_from_fftparams(precision); // throws std::runtime_error for an unmapped precision
+    }
+
+    size_t vram_footprint() override
+    {
+        const size_t base_footprint = fft_params::vram_footprint();
+        if(setup_structs() != fft_status_success)
+        {
+            throw std::runtime_error("Struct setup failed");
+        }
+
+        // workbuffersize is added on top of the base-class figure once setup has succeeded
+        return base_footprint + workbuffersize;
+    }
+
+    fft_status setup_structs()
+    {
+        rocfft_status fft_status = rocfft_status_success;
+        if(desc == nullptr)
+        {
+            rocfft_plan_description_create(&desc);
+            if(fft_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(fft_status);
+
+            fft_status
+                = rocfft_plan_description_set_data_layout(desc,
+                                                          rocfft_array_type_from_fftparams(itype),
+                                                          rocfft_array_type_from_fftparams(otype),
+                                                          ioffset.data(),
+                                                          ooffset.data(),
+                                                          istride_cm().size(),
+                                                          istride_cm().data(),
+                                                          idist,
+                                                          ostride_cm().size(),
+                                                          ostride_cm().data(),
+                                                          odist);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
+            }
+
+            if(scale_factor != 1.0)
+            {
+                fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
+                if(fft_status != rocfft_status_success)
+                {
+                    throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
+                }
+            }
+        }
+
+        if(plan == nullptr)
+        {
+            fft_status = rocfft_plan_create(&plan,
+                                            rocfft_result_placement_from_fftparams(placement),
+                                            rocfft_transform_type_from_fftparams(transform_type),
+                                            get_rocfft_precision(),
+                                            length_cm().size(),
+                                            length_cm().data(),
+                                            nbatch,
+                                            desc);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_create failed");
+            }
+        }
+
+        if(info == nullptr)
+        {
+            fft_status = rocfft_execution_info_create(&info);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_create failed");
+            }
+        }
+
+        fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize);
+        if(fft_status != rocfft_status_success)
+        {
+            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
+        }
+
+        return fft_status_from_rocfftparams(fft_status);
+    }
+
+    fft_status create_plan() override
+    {
+        fft_status ret = setup_structs();
+        if(ret != fft_status_success)
+        {
+            return ret;
+        }
+        if(workbuffersize > 0)
+        {
+            hipError_t hip_status = hipSuccess;
+            hip_status            = wbuffer.alloc(workbuffersize);
+            if(hip_status != hipSuccess)
+            {
+                std::ostringstream oss;
+                oss << "work buffer allocation failed (" << workbuffersize << " requested)";
+                size_t mem_free  = 0;
+                size_t mem_total = 0;
+                hip_status       = hipMemGetInfo(&mem_free, &mem_total);
+                if(hip_status == hipSuccess)
+                {
+                    oss << "free vram: " << mem_free << " total vram: " << mem_total;
+                }
+                else
+                {
+                    oss << "hipMemGetInfo also failed";
+                }
+                throw work_buffer_alloc_failure(oss.str());
+            }
+
+            auto rocret
+                = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
+            if(rocret != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
+            }
+        }
+
+        return ret;
+    }
+
+    fft_status set_callbacks(void* load_cb_host,
+                             void* load_cb_data,
+                             void* store_cb_host,
+                             void* store_cb_data) override
+    {
+        if(run_callbacks)
+        {
+            auto roc_status
+                = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+
+            roc_status
+                = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+        }
+        return fft_status_success;
+    }
+
+    fft_status execute(void** in, void** out) override
+    {
+        auto ret = rocfft_execute(plan, in, out, info);
+        return fft_status_from_rocfftparams(ret);
+    }
+};
+
+#endif
diff -Nru rocfft-5.5.0/solution_map/gfx908_rocfft_solution_map.dat rocfft-5.7.1/solution_map/gfx908_rocfft_solution_map.dat
--- rocfft-5.5.0/solution_map/gfx908_rocfft_solution_map.dat	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/solution_map/gfx908_rocfft_solution_map.dat	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,299 @@
+{"Version":1,
+"Data":[
+{"Problem":{"arch":"gfx908","token":"kernel_token_builtin_kernel"},
+ "Solutions":[ {"sol_node_type":"SOL_BUILTIN_KERNEL"}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len125_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":18,"wgs":450,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":10,"wgs":250,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len2187_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 2187,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":1,"wgs":243,"tpt":[ 243,0 ],"factors":[ 9,9,3,3,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len243_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":216,"tpt":[ 27,0 ],"factors":[ 9,3,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":108,"tpt":[ 27,0 ],"factors":[ 9,3,3,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len256_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":128,"tpt":[ 32,0 ],"factors":[ 4,2,8,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":128,"tpt":[ 32,0 ],"factors":[ 8,2,8,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len4096_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 4096,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":2,"wgs":256,"tpt":[ 128,0 ],"factors":[ 8,16,4,8 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 4096,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":2,"wgs":512,"tpt":[ 256,0 ],"factors":[ 8,8,16,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 4096,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":2,"wgs":512,"tpt":[ 256,0 ],"factors":[ 4,8,8,4,4 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len56_double_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":32,"wgs":256,"tpt":[ 8,0 ],"factors":[ 2,2,7,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":32,"wgs":256,"tpt":[ 8,0 ],"factors":[ 7,4,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len100_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 100,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":120,"tpt":[ 10,0 ],"factors":[ 10,5,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len125_single_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":26,"wgs":130,"tpt":[ 5,0 ],"factors":[ 5,5,5 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":25,"wgs":125,"tpt":[ 5,0 ],"factors":[ 5,5,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len168_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":8,"wgs":64,"tpt":[ 8,0 ],"factors":[ 7,3,8 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":168,"tpt":[ 14,0 ],"factors":[ 2,6,7,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":168,"tpt":[ 14,0 ],"factors":[ 6,7,2,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":8,"wgs":168,"tpt":[ 21,0 ],"factors":[ 7,8,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":8,"wgs":112,"tpt":[ 14,0 ],"factors":[ 7,6,2,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":252,"tpt":[ 21,0 ],"factors":[ 7,8,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len243_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":8,"wgs":216,"tpt":[ 27,0 ],"factors":[ 9,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len243_single_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":15,"wgs":405,"tpt":[ 27,0 ],"factors":[ 3,3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len336_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":4,"wgs":112,"tpt":[ 28,0 ],"factors":[ 2,7,6,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":126,"tpt":[ 21,0 ],"factors":[ 7,16,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":4,"wgs":112,"tpt":[ 28,0 ],"factors":[ 6,7,2,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":126,"tpt":[ 21,0 ],"factors":[ 7,16,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len343_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 343,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":196,"tpt":[ 49,0 ],"factors":[ 7,7,7 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len64_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 64,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":64,"tpt":[ 4,0 ],"factors":[ 4,2,4,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 64,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":64,"tpt":[ 4,0 ],"factors":[ 2,4,4,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 64,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":64,"tpt":[ 4,0 ],"factors":[ 2,8,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 64,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":64,"tpt":[ 4,0 ],"factors":[ 4,4,2,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len81_single_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":28,"wgs":252,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":28,"wgs":252,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":28,"wgs":252,"tpt":[ 9,0 ],"factors":[ 3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len96_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 96,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":192,"tpt":[ 12,0 ],"factors":[ 8,6,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len100_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 100,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":20,"wgs":200,"tpt":[ 10,0 ],"factors":[ 5,10,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len112_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 112,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":64,"tpt":[ 8,0 ],"factors":[ 4,7,2,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len128_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 128,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 4,4,4,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 128,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 4,8,2,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len192_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 192,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":192,"tpt":[ 24,0 ],"factors":[ 4,3,2,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len256_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":128,"tpt":[ 16,0 ],"factors":[ 16,4,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":128,"tpt":[ 16,0 ],"factors":[ 8,2,4,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":128,"tpt":[ 16,0 ],"factors":[ 16,2,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len49_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 49,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":28,"wgs":196,"tpt":[ 7,0 ],"factors":[ 7,7 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len81_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":15,"wgs":135,"tpt":[ 9,0 ],"factors":[ 3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len81_single_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":36,"wgs":324,"tpt":[ 9,0 ],"factors":[ 3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len336_double_sbcr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":6,"wgs":168,"tpt":[ 28,0 ],"factors":[ 7,3,4,4 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len56_double_sbcr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 7,4,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"kernel_len256_single_sbrc_xy_z"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":256,"tpt":[ 16,0 ],"factors":[ 4,4,8,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx908","token":"125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len125_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len125_single_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"2187_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len2187_single_sbrr","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len243_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len243_single_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"256_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len256_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len256_single_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"4096_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len4096_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len4096_single_sbrr","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len4096_single_sbrr","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"56_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len56_double_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len56_double_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_100_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len100_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_single_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_single_sbcc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_168_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":2} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":3} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":4} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":5} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_243_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len243_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len243_single_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":2} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":3} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_343_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len343_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_64_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len64_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len64_double_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len64_double_sbcc","child_option":2} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len64_double_sbcc","child_option":3} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_81_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbcc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcc_96_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len96_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_100_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len100_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_112_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len112_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_128_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len128_double_sbrc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len128_double_sbrc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_192_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len192_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_256_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len256_double_sbrc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len256_double_sbrc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len256_double_sbrc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_49_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len49_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_81_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len81_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_81_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcr_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcr","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbcr_56_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len56_double_sbcr","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"leafnode_token_builtin_kernel"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_TRANSPOSE","solution_childnodes":[ {"child_token":"kernel_token_builtin_kernel","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_TRANSPOSE_XY_Z","solution_childnodes":[ {"child_token":"kernel_token_builtin_kernel","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_TRANSPOSE_Z_XY","solution_childnodes":[ {"child_token":"kernel_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"sbrc_xy_z_256_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z","solution_childnodes":[ {"child_token":"kernel_len256_single_sbrc_xy_z","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"14348907_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_TRTRT","solution_childnodes":[ {"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"6561_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"2187_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"16777216_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_TRTRT","solution_childnodes":[ {"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"4096_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"4096_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"10000_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_100_dp_ip_complex","child_option":0},{"child_token":"sbrc_100_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"10752_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_96_dp_ip_complex","child_option":0},{"child_token":"sbrc_112_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"16384_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_64_dp_ip_complex","child_option":3},{"child_token":"sbrc_256_dp_ip_complex","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"16807_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_343_dp_ip_complex","child_option":0},{"child_token":"sbrc_49_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"18816_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":5},{"child_token":"sbrc_112_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"19683_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_243_dp_ip_complex","child_option":0},{"child_token":"sbrc_81_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"21504_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":2},{"child_token":"sbrc_128_dp_ip_complex","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"32256_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":2},{"child_token":"sbrc_192_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"43008_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":4},{"child_token":"sbrc_256_dp_ip_complex","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"6561_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_81_sp_ip_complex","child_option":0},{"child_token":"sbrc_81_sp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_81_sp_ip_complex","child_option":1},{"child_token":"sbrc_81_sp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_81_sp_ip_complex","child_option":2},{"child_token":"sbrc_81_sp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"8192_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_64_dp_ip_complex","child_option":3},{"child_token":"sbrc_128_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"4096_4096_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RTRT","solution_childnodes":[ {"child_token":"4096_sp_ip_complex","child_option":3},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"4096_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"6561_6561_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RTRT","solution_childnodes":[ {"child_token":"6561_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"6561_sp_ip_complex","child_option":3},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"125_125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"125_sp_ip_complex","child_option":1},{"child_token":"sbcc_125_sp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"125_sp_ip_complex","child_option":2},{"child_token":"sbcc_125_sp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"243_243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"243_sp_ip_complex","child_option":1},{"child_token":"sbcc_243_sp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"56_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":1},{"child_token":"sbcc_336_dp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":2},{"child_token":"sbcc_336_dp_ip_complex","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"243_243_243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RTRT","solution_childnodes":[ {"child_token":"243_243_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":1},{"child_token":"243_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"56_336_336_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_BLOCK_CR","solution_childnodes":[ {"child_token":"sbcr_336_dp_ip_complex","child_option":0},{"child_token":"sbcr_336_dp_ip_complex","child_option":0},{"child_token":"sbcr_56_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"125_125_125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RC","solution_childnodes":[ {"child_token":"125_125_sp_ip_complex","child_option":2},{"child_token":"sbcc_125_sp_ip_complex","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx908","token":"56_336_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RC","solution_childnodes":[ {"child_token":"56_336_dp_ip_complex","child_option":2},{"child_token":"sbcc_336_dp_ip_complex","child_option":3} ]}
+               ]} ]
+}
\ No newline at end of file
diff -Nru rocfft-5.5.0/solution_map/gfx90a_rocfft_solution_map.dat rocfft-5.7.1/solution_map/gfx90a_rocfft_solution_map.dat
--- rocfft-5.5.0/solution_map/gfx90a_rocfft_solution_map.dat	1970-01-01 00:00:00.000000000 +0000
+++ rocfft-5.7.1/solution_map/gfx90a_rocfft_solution_map.dat	2023-08-09 16:19:51.000000000 +0000
@@ -0,0 +1,376 @@
+{"Version":1,
+"Data":[
+{"Problem":{"arch":"gfx90a","token":"kernel_token_builtin_kernel"},
+ "Solutions":[ {"sol_node_type":"SOL_BUILTIN_KERNEL"}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len125_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":10,"wgs":250,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":10,"wgs":250,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len2187_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 2187,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":1,"wgs":243,"tpt":[ 243,0 ],"factors":[ 9,9,9,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 2187,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":1,"wgs":243,"tpt":[ 243,0 ],"factors":[ 9,3,3,3,3,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len243_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":false,"buffer_inst":false,"tpb":4,"wgs":108,"tpt":[ 27,0 ],"factors":[ 9,9,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":7,"wgs":189,"tpt":[ 27,0 ],"factors":[ 9,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len256_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":128,"tpt":[ 32,0 ],"factors":[ 2,4,4,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len4096_single_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 4096,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":2,"wgs":256,"tpt":[ 128,0 ],"factors":[ 8,16,4,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len56_double_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 7,4,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":64,"tpt":[ 8,0 ],"factors":[ 7,4,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":24,"wgs":192,"tpt":[ 8,0 ],"factors":[ 7,2,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 4,7,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len81_double_sbrr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":21,"wgs":189,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len100_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 100,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":12,"wgs":120,"tpt":[ 10,0 ],"factors":[ 10,2,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len125_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":9,"wgs":225,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":10,"wgs":250,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len125_single_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":32,"wgs":160,"tpt":[ 5,0 ],"factors":[ 5,5,5 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":20,"wgs":500,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":20,"wgs":500,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len168_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":252,"tpt":[ 21,0 ],"factors":[ 7,8,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":168,"tpt":[ 14,0 ],"factors":[ 7,6,2,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 168,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":12,"wgs":168,"tpt":[ 14,0 ],"factors":[ 2,7,6,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len224_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 224,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":9,"wgs":252,"tpt":[ 28,0 ],"factors":[ 7,2,4,4 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len243_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":7,"wgs":189,"tpt":[ 27,0 ],"factors":[ 3,9,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":7,"wgs":189,"tpt":[ 27,0 ],"factors":[ 9,3,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":216,"tpt":[ 27,0 ],"factors":[ 9,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len243_single_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 243,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":432,"tpt":[ 27,0 ],"factors":[ 3,3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len336_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":168,"tpt":[ 28,0 ],"factors":[ 4,7,3,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":126,"tpt":[ 21,0 ],"factors":[ 7,8,2,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":4,"wgs":112,"tpt":[ 28,0 ],"factors":[ 2,7,6,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":252,"tpt":[ 42,0 ],"factors":[ 3,7,8,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":168,"tpt":[ 28,0 ],"factors":[ 7,3,4,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":168,"tpt":[ 42,0 ],"factors":[ 3,7,8,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":6,"wgs":252,"tpt":[ 42,0 ],"factors":[ 7,8,2,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":6,"wgs":126,"tpt":[ 21,0 ],"factors":[ 7,16,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len343_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 343,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":4,"wgs":196,"tpt":[ 49,0 ],"factors":[ 7,7,7 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len64_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 64,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":64,"tpt":[ 4,0 ],"factors":[ 8,2,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 64,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":64,"tpt":[ 4,0 ],"factors":[ 4,4,2,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len81_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":false,"buffer_inst":false,"tpb":21,"wgs":189,"tpt":[ 9,0 ],"factors":[ 3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len81_single_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":21,"wgs":189,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":true,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":21,"wgs":189,"tpt":[ 9,0 ],"factors":[ 3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len96_double_sbcc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 96,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":192,"tpt":[ 12,0 ],"factors":[ 8,3,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 96,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":true,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 4,2,3,4 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len100_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 100,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":20,"wgs":200,"tpt":[ 10,0 ],"factors":[ 10,5,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len112_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 112,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 7,2,8 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 112,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 2,4,7,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 112,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":64,"tpt":[ 8,0 ],"factors":[ 7,2,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len125_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 125,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":10,"wgs":250,"tpt":[ 25,0 ],"factors":[ 5,5,5 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len128_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 128,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":128,"tpt":[ 16,0 ],"factors":[ 8,8,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 128,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 8,2,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len192_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 192,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":4,"wgs":64,"tpt":[ 16,0 ],"factors":[ 2,8,6,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 192,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":128,"tpt":[ 16,0 ],"factors":[ 8,4,3,2 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 192,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":192,"tpt":[ 24,0 ],"factors":[ 8,3,8 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len256_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":8,"wgs":256,"tpt":[ 32,0 ],"factors":[ 8,4,4,2 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len49_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 49,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":false,"buffer_inst":false,"tpb":28,"wgs":196,"tpt":[ 7,0 ],"factors":[ 7,7 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len81_double_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":22,"wgs":198,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len81_single_sbrc"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":false,"buffer_inst":false,"tpb":36,"wgs":324,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":36,"wgs":324,"tpt":[ 9,0 ],"factors":[ 9,3,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 81,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","sbrc_trans":"TILE_UNALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":36,"wgs":324,"tpt":[ 9,0 ],"factors":[ 3,9,3 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len336_double_sbcr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":6,"wgs":252,"tpt":[ 42,0 ],"factors":[ 8,2,3,7 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":6,"wgs":252,"tpt":[ 42,0 ],"factors":[ 7,8,2,3 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":6,"wgs":252,"tpt":[ 42,0 ],"factors":[ 7,2,4,6 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 336,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":6,"wgs":252,"tpt":[ 42,0 ],"factors":[ 8,2,3,7 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len56_double_sbcr"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":true,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 2,7,4 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 56,0 ],"precision":"double","scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","sbrc_trans":"NONE","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":128,"tpt":[ 8,0 ],"factors":[ 2,7,4 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"kernel_len256_single_sbrc_xy_z"},
+ "Solutions":[ {"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":256,"tpt":[ 16,0 ],"factors":[ 4,2,2,16 ]}}}
+              ,{"sol_node_type":"SOL_KERNEL_ONLY","kernel_key":{"lengths":[ 256,0 ],"precision":"single","scheme":"CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z","sbrc_trans":"TILE_ALIGNED","kernelConfig":{"use_3steps":false,"half_lds":false,"dir_reg":true,"buffer_inst":false,"tpb":16,"wgs":256,"tpt":[ 16,0 ],"factors":[ 8,2,16 ]}}}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len125_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len125_single_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"2187_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len2187_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len2187_single_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len243_single_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len243_single_sbrr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"256_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len256_single_sbrr","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"4096_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len4096_single_sbrr","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"56_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len56_double_sbrr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len56_double_sbrr","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len56_double_sbrr","child_option":2} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len56_double_sbrr","child_option":3} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"81_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM","solution_childnodes":[ {"child_token":"kernel_len81_double_sbrr","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_100_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len100_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_125_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_double_sbcc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_single_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_single_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len125_single_sbcc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_168_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len168_double_sbcc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_224_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len224_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_243_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len243_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len243_double_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len243_double_sbcc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len243_single_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":2} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":3} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":4} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":5} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":6} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcc","child_option":7} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_343_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len343_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_64_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len64_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len64_double_sbcc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_81_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len81_double_sbcc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_81_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbcc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcc_96_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len96_double_sbcc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CC","solution_childnodes":[ {"child_token":"kernel_len96_double_sbcc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_100_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len100_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_112_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len112_double_sbrc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len112_double_sbrc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len112_double_sbrc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_125_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len125_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_128_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len128_double_sbrc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len128_double_sbrc","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_192_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len192_double_sbrc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len192_double_sbrc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len192_double_sbrc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_256_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len256_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_49_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len49_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_81_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len81_double_sbrc","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_81_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbrc","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbrc","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_RC","solution_childnodes":[ {"child_token":"kernel_len81_single_sbrc","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcr_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcr","child_option":1} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcr","child_option":2} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len336_double_sbcr","child_option":3} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbcr_56_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len56_double_sbcr","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_BLOCK_CR","solution_childnodes":[ {"child_token":"kernel_len56_double_sbcr","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"leafnode_token_builtin_kernel"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_TRANSPOSE","solution_childnodes":[ {"child_token":"kernel_token_builtin_kernel","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_TRANSPOSE_XY_Z","solution_childnodes":[ {"child_token":"kernel_token_builtin_kernel","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_TRANSPOSE_Z_XY","solution_childnodes":[ {"child_token":"kernel_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"sbrc_xy_z_256_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z","solution_childnodes":[ {"child_token":"kernel_len256_single_sbrc_xy_z","child_option":0} ]}
+              ,{"sol_node_type":"SOL_LEAF_NODE","using_scheme":"CS_KERNEL_STOCKHAM_TRANSPOSE_XY_Z","solution_childnodes":[ {"child_token":"kernel_len256_single_sbrc_xy_z","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"14348907_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_TRTRT","solution_childnodes":[ {"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"6561_sp_ip_complex","child_option":3},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"2187_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"16777216_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_TRTRT","solution_childnodes":[ {"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"4096_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"4096_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"10000_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_100_dp_ip_complex","child_option":0},{"child_token":"sbrc_100_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"10752_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_96_dp_ip_complex","child_option":1},{"child_token":"sbrc_112_dp_ip_complex","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"15625_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_125_dp_ip_complex","child_option":1},{"child_token":"sbrc_125_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"16384_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_64_dp_ip_complex","child_option":1},{"child_token":"sbrc_256_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"16807_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_343_dp_ip_complex","child_option":0},{"child_token":"sbrc_49_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"18816_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":0},{"child_token":"sbrc_112_dp_ip_complex","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"21504_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":0},{"child_token":"sbrc_128_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"32256_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_168_dp_ip_complex","child_option":2},{"child_token":"sbrc_192_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"43008_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_224_dp_ip_complex","child_option":0},{"child_token":"sbrc_192_dp_ip_complex","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"6561_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_81_sp_ip_complex","child_option":0},{"child_token":"sbrc_81_sp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_81_sp_ip_complex","child_option":0},{"child_token":"sbrc_81_sp_ip_complex","child_option":1} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_81_sp_ip_complex","child_option":1},{"child_token":"sbrc_81_sp_ip_complex","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"8192_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_L1D_CC","solution_childnodes":[ {"child_token":"sbcc_64_dp_ip_complex","child_option":0},{"child_token":"sbrc_128_dp_ip_complex","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"4096_4096_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RTRT","solution_childnodes":[ {"child_token":"4096_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"4096_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"6561_6561_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RTRT","solution_childnodes":[ {"child_token":"6561_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":0},{"child_token":"6561_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"125_125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"125_sp_ip_complex","child_option":1},{"child_token":"sbcc_125_sp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"125_sp_ip_complex","child_option":2},{"child_token":"sbcc_125_sp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"243_243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"243_sp_ip_complex","child_option":1},{"child_token":"sbcc_243_sp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"56_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":1},{"child_token":"sbcc_336_dp_ip_complex","child_option":0} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":2},{"child_token":"sbcc_336_dp_ip_complex","child_option":2} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":3},{"child_token":"sbcc_336_dp_ip_complex","child_option":4} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":4},{"child_token":"sbcc_336_dp_ip_complex","child_option":5} ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"56_dp_ip_complex","child_option":1},{"child_token":"sbcc_336_dp_ip_complex","child_option":7} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"81_81_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_DUMMY","using_scheme":"CS_NONE","solution_childnodes":[  ]}
+              ,{"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"81_dp_ip_complex","child_option":1},{"child_token":"sbcc_81_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"81_81_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_2D_RC","solution_childnodes":[ {"child_token":"81_dp_ip_complex","child_option":1},{"child_token":"sbcc_81_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"243_243_243_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RTRT","solution_childnodes":[ {"child_token":"243_243_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":1},{"child_token":"243_sp_ip_complex","child_option":2},{"child_token":"leafnode_token_builtin_kernel","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"256_256_256_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_BLOCK_RC","solution_childnodes":[ {"child_token":"sbrc_xy_z_256_sp_ip_complex","child_option":0},{"child_token":"sbrc_xy_z_256_sp_ip_complex","child_option":1},{"child_token":"256_sp_ip_complex","child_option":1},{"child_token":"leafnode_token_builtin_kernel","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"56_336_336_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_BLOCK_CR","solution_childnodes":[ {"child_token":"sbcr_336_dp_ip_complex","child_option":2},{"child_token":"sbcr_336_dp_ip_complex","child_option":3},{"child_token":"sbcr_56_dp_ip_complex","child_option":1} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"56_336_336_dp_op_complex_fwd_batch_1_istride_1_56_18816_ostride_1_56_18816_idist_6322176_odist_6322176_ioffset_0_ooffset_0"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_BLOCK_CR","solution_childnodes":[ {"child_token":"sbcr_336_dp_ip_complex","child_option":0},{"child_token":"sbcr_336_dp_ip_complex","child_option":1},{"child_token":"sbcr_56_dp_ip_complex","child_option":0} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"125_125_125_sp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RC","solution_childnodes":[ {"child_token":"125_125_sp_ip_complex","child_option":2},{"child_token":"sbcc_125_sp_ip_complex","child_option":2} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"56_336_336_dp_ip_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RC","solution_childnodes":[ {"child_token":"56_336_dp_ip_complex","child_option":5},{"child_token":"sbcc_336_dp_ip_complex","child_option":6} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"56_336_336_dp_ip_complex_fwd_batch_1_istride_1_56_18816_ostride_1_56_18816_idist_6322176_odist_6322176_ioffset_0_ooffset_0"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RC","solution_childnodes":[ {"child_token":"56_336_dp_ip_complex","child_option":4},{"child_token":"sbcc_336_dp_ip_complex","child_option":6} ]}
+               ]},
+{"Problem":{"arch":"gfx90a","token":"81_81_81_dp_op_complex"},
+ "Solutions":[ {"sol_node_type":"SOL_INTERNAL_NODE","using_scheme":"CS_3D_RC","solution_childnodes":[ {"child_token":"81_81_dp_ip_complex","child_option":1},{"child_token":"sbcc_81_dp_ip_complex","child_option":0} ]}
+               ]} ]
+}
\ No newline at end of file